Spaces:

joshuasundance
/

geospatial-data-converter

Running

App Files Community

Joshua Sundance Bailey committed on Oct 12, 2023

Commit

c6718c6

1 Parent(s): 11a4b10

messy old code

Browse files

Files changed (4) hide show

geospatial-data-converter/kml_tricks.py +122 -55
geospatial-data-converter/kml_tricks_before.py +0 -141
geospatial-data-converter/kml_tricks_refactor.py +0 -171
geospatial-data-converter/utils.py +2 -4

geospatial-data-converter/kml_tricks.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import zipfile
 from io import StringIO
-from typing import Any
 import bs4
 import geopandas as gpd
@@ -8,100 +7,168 @@ import lxml  # nosec
 import pandas as pd
-def parse_description_to_gdf(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
-    def _gen():
-        for desc in gdf["Description"]:
-            try:
-                html_df = pd.read_html(StringIO(desc), flavor="lxml")
-                yield html_df[-1].T
-            except (lxml.etree.ParserError, lxml.etree.XMLSyntaxError) as e:
-                raise pd.errors.ParserError from e
-    parsed_dataframes = list(_gen())
-    for df in parsed_dataframes:
-        df.columns = df.iloc[0]
-        df.drop(df.index[0], inplace=True)
-    combined_df = pd.concat(parsed_dataframes, ignore_index=True)
-    combined_df["geometry"] = gdf["geometry"]
-    return gpd.GeoDataFrame(combined_df, crs=gdf.crs)
-def read_kml_file(path: str) -> Any:
-    with zipfile.ZipFile(path, "r") as kmz:
-        kml_files = [f for f in kmz.namelist() if f.endswith(".kml")]
     if len(kml_files) != 1:
         raise IndexError(
-            "KMZ contains more than one KML. Extract or convert to multiple KMLs.",
         )
-    return gpd.read_file(
-        f"zip://{path}\\{kml_files[0]}",
         driver="KML",
         engine="pyogrio",
     )
-def parse_file_to_gdf(path: str) -> gpd.GeoDataFrame:
-    if path.endswith(".kml"):
-        return parse_description_to_gdf(
-            gpd.read_file(path, driver="KML", engine="pyogrio"),
-        )
-    if path.endswith(".kmz"):
-        return parse_description_to_gdf(read_kml_file(path))
-    raise ValueError("File must end with .kml or .kmz")
 def extract_data_from_kml_code(kml_code: str) -> pd.DataFrame:
-    soup = bs4.BeautifulSoup(kml_code, features="xml")
-    rows = soup.find_all("schemadata")
-    data = (
-        {field.get("name"): field.text for field in row.find_all("simpledata")}
-        for row in rows
     )
-    return pd.DataFrame(data)
-def extract_kml_from_file(file_path: str) -> str:
     file_extension = file_path.lower().split(".")[-1]
     if file_extension == "kml":
-        with open(file_path, "r") as kml:
-            return kml.read()
-    if file_extension == "kmz":
-        with zipfile.ZipFile(file_path) as kmz:
-            kml_files = [f for f in kmz.namelist() if f.lower().endswith(".kml")]
             if len(kml_files) != 1:
                 raise IndexError(
-                    "KMZ contains more than one KML. Extract or convert to multiple KMLs.",
                 )
-            with kmz.open(kml_files[0]) as kml:
-                return kml.read().decode()
-    raise ValueError("File path must end with .kml or .kmz")
-def extract_data_from_file(file_path: str) -> gpd.GeoDataFrame:
-    df = extract_data_from_kml_code(extract_kml_from_file(file_path))
     if file_path.endswith(".kmz"):
-        file_gdf = read_kml_file(file_path)
     else:
-        file_gdf = gpd.read_file(file_path, driver="KML", engine="pyogrio")
-    return gpd.GeoDataFrame(df, geometry=file_gdf["geometry"], crs=file_gdf.crs)
-def read_ge_file(file_path: str) -> gpd.GeoDataFrame:
     try:
-        return parse_file_to_gdf(file_path)
-    except (pd.errors.ParserError, ValueError):
-        return extract_data_from_file(file_path)

 import zipfile
 from io import StringIO
 import bs4
 import geopandas as gpd
 import pandas as pd
+def parse_descriptions_to_geodf(geodf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
+    """Parses Descriptions from Google Earth file to a GeoDataFrame object"""
+    dataframes = []
+    # Iterate over descriptions and extract data
+    for desc in geodf["Description"]:
+        desc_as_io = StringIO(desc)
+        # Try to read the description into a DataFrame
+        parsed_html = pd.read_html(desc_as_io)
+        try:
+            temp_df = parsed_html[1].T
+        except IndexError:
+            temp_df = parsed_html[0].T
+        # Set DataFrame header and remove the first row
+        temp_df.columns = temp_df.iloc[0]
+        temp_df = temp_df.iloc[1:]
+        dataframes.append(temp_df)
+    # Combine all DataFrames
+    combined_df = pd.concat(dataframes, ignore_index=True)
+    # Add geometry data
+    combined_df["geometry"] = geodf["geometry"]
+    # Create a GeoDataFrame with the combined data and original CRS
+    result_geodf = gpd.GeoDataFrame(combined_df, crs=geodf.crs)
+    return result_geodf
+def load_kmz_as_geodf(file_path: str) -> gpd.GeoDataFrame:
+    """Loads a KMZ file into a GeoPandas DataFrame, assuming the KMZ contains one KML file"""
+    # Open the KMZ file
+    with zipfile.ZipFile(file_path, "r") as kmz:
+        # List all KML files in the KMZ
+        kml_files = [file for file in kmz.namelist() if file.endswith(".kml")]
+    # Ensure there's only one KML file in the KMZ
     if len(kml_files) != 1:
         raise IndexError(
+            "KMZ contains more than one KML. Please extract or convert to multiple KMLs.",
         )
+    # Read the KML file into a GeoDataFrame
+    geodf = gpd.read_file(
+        f"zip://{file_path}/{kml_files[0]}",
         driver="KML",
         engine="pyogrio",
     )
+    return geodf
+def load_ge_file(file_path: str) -> gpd.GeoDataFrame:
+    """Loads a KML or KMZ file and parses its descriptions into a GeoDataFrame"""
+    if file_path.endswith(".kml"):
+        return parse_descriptions_to_geodf(
+            gpd.read_file(file_path, driver="KML", engine="pyogrio"),
+        )
+    elif file_path.endswith(".kmz"):
+        return parse_descriptions_to_geodf(load_kmz_as_geodf(file_path))
+    raise ValueError("The file must have a .kml or .kmz extension.")
 def extract_data_from_kml_code(kml_code: str) -> pd.DataFrame:
+    """Extracts data from KML code into a DataFrame using SimpleData tags, excluding embedded tables in feature descriptions"""
+    # Parse the KML source code
+    soup = bs4.BeautifulSoup(kml_code, "html.parser")
+    # Find all SchemaData tags (representing rows)
+    schema_data_tags = soup.find_all("schemadata")
+    # Create a generator that yields a dictionary for each row, containing the Placemark name and each SimpleData field
+    row_dicts = (
+        {
+            "Placemark_name": tag.parent.parent.find("name").text,
+            **{field.get("name"): field.text for field in tag.find_all("simpledata")},
+        }
+        for tag in schema_data_tags
     )
+    # Convert the row dictionaries into a DataFrame
+    df = pd.DataFrame(row_dicts)
+    return df
+def extract_kml_code_from_file(file_path: str) -> str:
+    """Extracts KML source code from a Google Earth file (KML or KMZ)"""
     file_extension = file_path.lower().split(".")[-1]
     if file_extension == "kml":
+        with open(file_path, "r") as kml_file:
+            kml_code = kml_file.read()
+    elif file_extension == "kmz":
+        with zipfile.ZipFile(file_path) as kmz_file:
+            # Find all KML files in the KMZ
+            kml_files = [
+                file for file in kmz_file.namelist() if file.lower().endswith(".kml")
+            ]
             if len(kml_files) != 1:
                 raise IndexError(
+                    "KMZ file contains more than one KML. Please extract or convert to multiple KMLs.",
                 )
+            with kmz_file.open(kml_files[0]) as kml_file:
+                # Decode the KML file's content from bytes to string
+                kml_code = kml_file.read().decode()
+    else:
+        raise ValueError("The input file must have a .kml or .kmz extension.")
+    return kml_code
+def extract_data_from_ge_file(file_path: str) -> gpd.GeoDataFrame:
+    """Extracts data from a Google Earth file (KML or KMZ) into a GeoDataFrame using SimpleData tags, excluding embedded tables in feature descriptions"""
+    data_df = extract_data_from_kml_code(extract_kml_code_from_file(file_path))
     if file_path.endswith(".kmz"):
+        ge_file_gdf = load_kmz_as_geodf(file_path)
     else:
+        ge_file_gdf = gpd.read_file(file_path, driver="KML", engine="pyogrio")
+    geo_df = gpd.GeoDataFrame(
+        data_df,
+        geometry=ge_file_gdf["geometry"],
+        crs=ge_file_gdf.crs,
+    )
+    return geo_df
+def load_ge_data(file_path: str) -> gpd.GeoDataFrame:
+    """Extracts data from a Google Earth file (KML or KMZ) and handles errors due to parsing issues"""
+    kml_code = extract_kml_code_from_file(file_path)
+    # Choose the extraction method based on the presence of SimpleData or SimpleField tags in the KML code
+    primary_func, fallback_func = (
+        (extract_data_from_ge_file, load_ge_file)
+        if any(tag in kml_code.lower() for tag in ("<simpledata", "<simplefield"))
+        else (load_ge_file, extract_data_from_ge_file)
+    )
     try:
+        data_df = primary_func(file_path)
+    except (
+        pd.errors.ParserError,
+        lxml.etree.ParserError,
+        lxml.etree.XMLSyntaxError,
+        ValueError,
+    ):
+        data_df = fallback_func(file_path)
+    return data_df

geospatial-data-converter/kml_tricks_before.py DELETED Viewed

@@ -1,141 +0,0 @@
-import zipfile
-from io import StringIO
-import bs4
-import fiona
-import geopandas as gpd
-import lxml  # nosec
-import pandas as pd
-fiona.drvsupport.supported_drivers["KML"] = "rw"
-def desctogdf(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
-    """Parses Descriptions from Google Earth file to create a legit gpd.GeoDataFrame"""
-    dfs = []
-    len(gdf)
-    # pull chunks of data from feature descriptions
-    for idx, desc in enumerate(gdf["Description"], start=1):
-        desc = StringIO(desc)
-        try:
-            tmpdf = pd.read_html(desc)[1].T
-        except IndexError:
-            tmpdf = pd.read_html(desc)[0].T
-        tmpdf.columns = tmpdf.iloc[0]
-        tmpdf = tmpdf.iloc[1:]
-        dfs.append(tmpdf)
-    # join chunks together
-    ccdf = pd.concat(dfs, ignore_index=True)
-    ccdf["geometry"] = gdf["geometry"]
-    df = gpd.GeoDataFrame(ccdf, crs=gdf.crs)
-    return df
-def readkmz(path: str) -> gpd.GeoDataFrame:
-    """Simply read kmz using geopandas/fiona without parsing Descriptions"""
-    # get name of kml in kmz (should be doc.kml but we don't assume)
-    with zipfile.ZipFile(path, "r") as kmz:
-        namelist = [f for f in kmz.namelist() if f.endswith(".kml")]
-    if len(namelist) != 1:
-        # this should never really happen
-        raise IndexError(
-            "kmz contains more than one kml. Extract or convert to multiple kmls.",
-        )
-    # return GeoDataFrame by reading contents of kmz
-    return gpd.read_file("zip://{}\\{}".format(path, namelist[0]), driver="KML")
-def ge_togdf(gefile: str) -> gpd.GeoDataFrame:
-    """Return gpd.GeoDataFrame after reading kmz or kml and parsing Descriptions"""
-    if gefile.endswith(".kml"):
-        gdf = desctogdf(gpd.read_file(gefile, driver="KML"))
-    elif gefile.endswith(".kmz"):
-        gdf = desctogdf(readkmz(gefile))
-    else:
-        raise ValueError("File must end with .kml or .kmz")
-    return gdf
-def simpledata_fromcode(kmlcode: str) -> pd.DataFrame:
-    """Return DataFrame extracted from KML code
-    parameter kmlcode (str): kml source code
-    Uses simpledata tags, NOT embedded tables in feature descriptions
-    """
-    # get the KML source code as a BeautifulSoup object
-    soup = bs4.BeautifulSoup(kmlcode, "html.parser")
-    # find all rows (schemadata tags) in the soup
-    rowtags = soup.find_all("schemadata")
-    # generator expression yielding a {name: value} dict for each row
-    rowdicts = (
-        {
-            **{"Placemark_name": row.parent.parent.find("name").text},
-            **{field.get("name"): field.text for field in row.find_all("simpledata")},
-        }
-        for row in rowtags
-    )
-    # return pd.DataFrame from row dict generator
-    return pd.DataFrame(rowdicts)
-def kmlcode_fromfile(gefile: str) -> str:
-    """Return kml source code (str) extracted from Google Earth File
-    parameter gefile (str): absolute or relative path to Google Earth file
-    (kmz or kml)
-    Uses simpledata tags, NOT embedded tables in feature descriptions
-    """
-    fileextension = gefile.lower().split(".")[-1]
-    if fileextension == "kml":
-        with open(gefile, "r") as kml:
-            kmlsrc = kml.read()
-    elif fileextension == "kmz":
-        with zipfile.ZipFile(gefile) as kmz:
-            # there should only be one kml file and it should be named doc.kml
-            # we won't make that assumption
-            kmls = [f for f in kmz.namelist() if f.lower().endswith(".kml")]
-            if len(kmls) != 1:
-                raise IndexError(
-                    "kmz contains more than one kml. Extract or convert to multiple kmls.",
-                )
-            with kmz.open(kmls[0]) as kml:
-                # .decode() because zipfile.ZipFile.open(name).read() -> bytes
-                kmlsrc = kml.read().decode()
-    else:
-        raise ValueError("parameter gefile must end with .kml or .kmz")
-    return kmlsrc
-def simpledata_fromfile(gefile: str) -> gpd.GeoDataFrame:
-    """Return DataFrame extracted from Google Earth File
-    parameter gefile (str): absolute or relative path to Google Earth file
-    (kmz or kml)
-    Uses simpledata tags, NOT embedded tables in feature descriptions
-    """
-    df = simpledata_fromcode(kmlcode_fromfile(gefile))
-    if gefile.endswith(".kmz"):
-        gefile_gdf = readkmz(gefile)
-    else:
-        gefile_gdf = gpd.read_file(gefile, driver="KML")
-    gdf = gpd.GeoDataFrame(df, geometry=gefile_gdf["geometry"], crs=gefile_gdf.crs)
-    return gdf
-def readge(gefile: str) -> pd.DataFrame:
-    """Extract data from Google Earth file & save as zip
-    parameter gefile (str): absolute or relative path to Google Earth file
-    parameter zipfile (str): absolute or relative path to output zip file
-    Will read simpledata tags OR embedded tables in feature descriptions
-    """
-    code = kmlcode_fromfile(gefile)
-    func1, func2 = ge_togdf, simpledata_fromfile
-    if any((tag in code.lower() for tag in ("<simpledata", "<simplefield"))):
-        func1, func2 = func2, func1
-    try:
-        df = func1(gefile)
-    except (
-        pd.errors.ParserError,
-        lxml.etree.ParserError,
-        lxml.etree.XMLSyntaxError,
-        ValueError,
-    ):
-        df = func2(gefile)
-    return df

geospatial-data-converter/kml_tricks_refactor.py DELETED Viewed

@@ -1,171 +0,0 @@
-import zipfile
-from io import StringIO
-import bs4
-import fiona
-import geopandas as gpd
-import lxml  # nosec
-import pandas as pd
-fiona.drvsupport.supported_drivers["KML"] = "rw"
-def parse_descriptions_to_geodf(geodf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
-    """Parses Descriptions from Google Earth file to a GeoDataFrame object"""
-    dataframes = []
-    # Iterate over descriptions and extract data
-    for desc in geodf["Description"]:
-        desc_as_io = StringIO(desc)
-        # Try to read the description into a DataFrame
-        parsed_html = pd.read_html(desc_as_io)
-        try:
-            temp_df = parsed_html[1].T
-        except IndexError:
-            temp_df = parsed_html[0].T
-        # Set DataFrame header and remove the first row
-        temp_df.columns = temp_df.iloc[0]
-        temp_df = temp_df.iloc[1:]
-        dataframes.append(temp_df)
-    # Combine all DataFrames
-    combined_df = pd.concat(dataframes, ignore_index=True)
-    # Add geometry data
-    combined_df["geometry"] = geodf["geometry"]
-    # Create a GeoDataFrame with the combined data and original CRS
-    result_geodf = gpd.GeoDataFrame(combined_df, crs=geodf.crs)
-    return result_geodf
-def load_kmz_as_geodf(file_path: str) -> gpd.GeoDataFrame:
-    """Loads a KMZ file into a GeoPandas DataFrame, assuming the KMZ contains one KML file"""
-    # Open the KMZ file
-    with zipfile.ZipFile(file_path, "r") as kmz:
-        # List all KML files in the KMZ
-        kml_files = [file for file in kmz.namelist() if file.endswith(".kml")]
-    # Ensure there's only one KML file in the KMZ
-    if len(kml_files) != 1:
-        raise IndexError(
-            "KMZ contains more than one KML. Please extract or convert to multiple KMLs.",
-        )
-    # Read the KML file into a GeoDataFrame
-    geodf = gpd.read_file(f"zip://{file_path}/{kml_files[0]}", driver="KML")
-    return geodf
-def load_ge_file(file_path: str) -> gpd.GeoDataFrame:
-    """Loads a KML or KMZ file and parses its descriptions into a GeoDataFrame"""
-    if file_path.endswith(".kml"):
-        return parse_descriptions_to_geodf(gpd.read_file(file_path, driver="KML"))
-    elif file_path.endswith(".kmz"):
-        return parse_descriptions_to_geodf(load_kmz_as_geodf(file_path))
-    raise ValueError("The file must have a .kml or .kmz extension.")
-def extract_data_from_kml_code(kml_code: str) -> pd.DataFrame:
-    """Extracts data from KML code into a DataFrame using SimpleData tags, excluding embedded tables in feature descriptions"""
-    # Parse the KML source code
-    soup = bs4.BeautifulSoup(kml_code, "html.parser")
-    # Find all SchemaData tags (representing rows)
-    schema_data_tags = soup.find_all("schemadata")
-    # Create a generator that yields a dictionary for each row, containing the Placemark name and each SimpleData field
-    row_dicts = (
-        {
-            "Placemark_name": tag.parent.parent.find("name").text,
-            **{field.get("name"): field.text for field in tag.find_all("simpledata")},
-        }
-        for tag in schema_data_tags
-    )
-    # Convert the row dictionaries into a DataFrame
-    df = pd.DataFrame(row_dicts)
-    return df
-def extract_kml_code_from_file(file_path: str) -> str:
-    """Extracts KML source code from a Google Earth file (KML or KMZ)"""
-    file_extension = file_path.lower().split(".")[-1]
-    if file_extension == "kml":
-        with open(file_path, "r") as kml_file:
-            kml_code = kml_file.read()
-    elif file_extension == "kmz":
-        with zipfile.ZipFile(file_path) as kmz_file:
-            # Find all KML files in the KMZ
-            kml_files = [
-                file for file in kmz_file.namelist() if file.lower().endswith(".kml")
-            ]
-            if len(kml_files) != 1:
-                raise IndexError(
-                    "KMZ file contains more than one KML. Please extract or convert to multiple KMLs.",
-                )
-            with kmz_file.open(kml_files[0]) as kml_file:
-                # Decode the KML file's content from bytes to string
-                kml_code = kml_file.read().decode()
-    else:
-        raise ValueError("The input file must have a .kml or .kmz extension.")
-    return kml_code
-def extract_data_from_ge_file(file_path: str) -> gpd.GeoDataFrame:
-    """Extracts data from a Google Earth file (KML or KMZ) into a GeoDataFrame using SimpleData tags, excluding embedded tables in feature descriptions"""
-    data_df = extract_data_from_kml_code(extract_kml_code_from_file(file_path))
-    if file_path.endswith(".kmz"):
-        ge_file_gdf = load_kmz_as_geodf(file_path)
-    else:
-        ge_file_gdf = gpd.read_file(file_path, driver="KML")
-    geo_df = gpd.GeoDataFrame(
-        data_df,
-        geometry=ge_file_gdf["geometry"],
-        crs=ge_file_gdf.crs,
-    )
-    return geo_df
-def load_ge_data(file_path: str) -> pd.DataFrame:
-    """Extracts data from a Google Earth file (KML or KMZ) and handles errors due to parsing issues"""
-    kml_code = extract_kml_code_from_file(file_path)
-    # Choose the extraction method based on the presence of SimpleData or SimpleField tags in the KML code
-    primary_func, fallback_func = (
-        (extract_data_from_ge_file, load_ge_file)
-        if any(tag in kml_code.lower() for tag in ("<simpledata", "<simplefield"))
-        else (load_ge_file, extract_data_from_ge_file)
-    )
-    try:
-        data_df = primary_func(file_path)
-    except (
-        pd.errors.ParserError,
-        lxml.etree.ParserError,
-        lxml.etree.XMLSyntaxError,
-        ValueError,
-    ):
-        data_df = fallback_func(file_path)
-    return data_df

geospatial-data-converter/utils.py CHANGED Viewed

@@ -6,9 +6,7 @@ from typing import BinaryIO
 import geopandas as gpd
-# from kml_tricks import read_ge_file
-# from kml_tricks_before import readge as read_ge_file
-from kml_tricks_refactor import load_ge_data as read_ge_file
 output_format_dict = {
     "ESRI Shapefile": ("shp", "zip", "application/zip"),  # must be zipped
@@ -39,7 +37,7 @@ def read_file(file: BinaryIO, *args, **kwargs) -> gpd.GeoDataFrame:
             tmp_file_path = os.path.join(tmp_dir, file.name)
             with open(tmp_file_path, "wb") as tmp_file:
                 tmp_file.write(file.read())
-            return read_ge_file(tmp_file_path)
     return gpd.read_file(file, *args, engine="pyogrio", **kwargs)

 import geopandas as gpd
+from kml_tricks import load_ge_data
 output_format_dict = {
     "ESRI Shapefile": ("shp", "zip", "application/zip"),  # must be zipped
             tmp_file_path = os.path.join(tmp_dir, file.name)
             with open(tmp_file_path, "wb") as tmp_file:
                 tmp_file.write(file.read())
+            return load_ge_data(tmp_file_path)
     return gpd.read_file(file, *args, engine="pyogrio", **kwargs)