satupeta-main/app/mapset_pipeline/utils/file_ops.py
2026-02-23 12:20:42 +07:00

106 lines
3.6 KiB
Python
Executable File

import logging
import os
import uuid
import zipfile
from datetime import datetime

import geopandas as gpd
from shapely import wkt
from shapely.errors import ShapelyError
def detect_zip_type(zip_path: str) -> str:
    """Classify a ZIP archive by the kind of spatial dataset it contains.

    Only the archive's member names are inspected (case-insensitively);
    nothing is extracted.

    Args:
        zip_path: Path to the ZIP file to inspect.

    Returns:
        ``"gdb"`` if any member path contains a ``.gdb/`` directory or ends
        with one of the geodatabase file extensions checked below,
        ``"shp"`` if any member ends with ``.shp``, otherwise ``"unknown"``.
        The geodatabase checks take precedence over the shapefile check.

    Raises:
        Whatever ``zipfile.ZipFile`` raises when the path cannot be opened
        as a ZIP archive (e.g. ``zipfile.BadZipFile``, ``FileNotFoundError``).
    """
    gdb_extensions = (".gdbtable", ".gdbtablx", ".gdbindexes", ".spx")

    # Lowercase every member name once instead of once per check.
    with zipfile.ZipFile(zip_path, "r") as archive:
        names = [name.lower() for name in archive.namelist()]

    # A substring test also covers names that merely end with ".gdb/".
    if any(".gdb/" in name for name in names):
        return "gdb"
    # Geodatabase files zipped without their enclosing ".gdb" directory.
    if any(name.endswith(gdb_extensions) for name in names):
        return "gdb"
    if any(name.endswith(".shp") for name in names):
        return "shp"
    return "unknown"
def generate_unique_filename(folder="tmp", ext="parquet", digits=6):
    """Return a path inside ``folder`` that does not currently exist.

    The folder is created if needed. The file name is the integer value of a
    random UUID4 followed by ``.ext``. The file itself is NOT created, so the
    name is only guaranteed unused at the moment of the check.

    Args:
        folder: Directory that will hold the file; created when missing.
        ext: File extension, without the leading dot.
        digits: Unused. Kept only so existing callers that pass it keep
            working; the identifier length is determined by the UUID.

    Returns:
        A string of the form ``"<folder>/<uuid-int>.<ext>"``.
    """
    os.makedirs(folder, exist_ok=True)
    while True:
        candidate = f"{folder}/{uuid.uuid4().int}.{ext}"
        # Retry in the (very unlikely) case the name is already taken.
        if not os.path.exists(candidate):
            return candidate
def generate_job_id(user_id: str) -> str:
    """Build a job identifier from the user id and the current time.

    Args:
        user_id: Identifier of the user who owns the job.

    Returns:
        ``"<user_id>_<YYYYmmddHHMMSS>"``, where the timestamp comes from
        ``datetime.now()`` (naive, second resolution).
    """
    # The format spec is handed to strftime by datetime.__format__.
    return f"{user_id}_{datetime.now():%Y%m%d%H%M%S}"
def _load_wkt_or_none(raw):
    """Parse a WKT string into a Shapely geometry; return None on failure."""
    if not isinstance(raw, str):
        return None
    try:
        return wkt.loads(raw)
    except Exception:
        # Deliberately broad (best effort): any row whose WKT cannot be
        # parsed is treated as missing. ShapelyError is itself an Exception,
        # so listing both was redundant.
        return None


def dataframe_validation(df_input, tmp_file):
    """Validate and clean WKT geometries, then export the result to Parquet.

    Intended to run in a separate thread (CPU-bound work).

    Steps:
        1. Parse the ``geometry`` column (WKT strings) into geometries stored
           in a new ``geom`` column; unparseable or non-string values are
           dropped.
        2. Repair invalid geometries with ``buffer(0)`` and drop any that
           are empty afterwards.
        3. Drop the original WKT column, set the CRS to EPSG:4326, and
           rename every column except ``geom`` to stripped UPPERCASE.
        4. Write the frame to ``tmp_file`` as Parquet.

    Args:
        df_input: Frame with a ``geometry`` column holding WKT strings
            (presumably a pandas DataFrame; confirm against the caller).
            It is copied, so the caller's object is not modified.
        tmp_file: Destination passed to ``GeoDataFrame.to_parquet``.

    Returns:
        Number of rows written.

    Raises:
        ValueError: If no row has a parseable geometry.
    """
    logger = logging.getLogger(__name__)

    # Work on a copy so the caller's data is left untouched.
    export_df = df_input.copy()

    # --- Stage 1: safe WKT loading ---------------------------------------
    export_df["geom"] = export_df["geometry"].apply(_load_wkt_or_none)

    # --- Stage 2: drop rows whose WKT could not be parsed ----------------
    export_df = export_df[export_df["geom"].notnull()]
    # Log counts only; printing the whole frame is too noisy and too slow.
    logger.debug(
        "%d of %d rows have a parseable geometry",
        len(export_df),
        len(df_input),
    )
    if export_df.empty:
        raise ValueError("Tidak ada data spasial valid yang ditemukan.")

    export_df = gpd.GeoDataFrame(export_df, geometry="geom")

    # --- Stage 3: fix topology -------------------------------------------
    # buffer(0) is the usual GIS trick for repairing light topology errors
    # such as self-intersecting polygon rings.
    export_df["geom"] = export_df["geom"].apply(
        lambda g: g if g.is_valid else g.buffer(0)
    )
    # A repair can leave a geometry empty (rare); drop those rows.
    # NOTE(review): if every row is dropped here, an empty file is written
    # and 0 is returned, with no error raised. Confirm this is intended.
    export_df = export_df[~export_df["geom"].is_empty]

    # --- Stage 4: finalise (CRS and column names) ------------------------
    export_df = export_df.drop(columns=["geometry"])  # old WKT string column
    export_df = export_df.set_crs("EPSG:4326", allow_override=True)
    # Attribute columns become stripped UPPERCASE (" id " -> "ID");
    # the geometry column keeps its lowercase name "geom".
    export_df = export_df.rename(
        columns=lambda c: c if c == "geom" else str(c).strip().upper()
    )

    export_df.to_parquet(tmp_file)
    return len(export_df)