# satupeta-main/app/mapset_pipeline/utils/file_ops.py
# (106 lines, 3.6 KiB, Python; 2026-02-23 05:20:42 +00:00)
import os
import uuid
import zipfile
import geopandas as gpd
from shapely import wkt
from shapely.errors import ShapelyError
from datetime import datetime
def detect_zip_type(zip_path: str) -> str:
    """Classify a ZIP archive by the kind of spatial dataset it contains.

    Only the archive's entry names are inspected (case-insensitively);
    nothing is extracted.

    Args:
        zip_path: Path to the ZIP archive to inspect.

    Returns:
        ``"gdb"`` if any entry name contains a ``.gdb/`` directory component
        or ends with a file-geodatabase extension (``.gdbtable``,
        ``.gdbtablx``, ``.gdbindexes``, ``.spx``); ``"shp"`` if any entry
        ends with ``.shp``; otherwise ``"unknown"``. The geodatabase checks
        take precedence over the shapefile check.

    Raises:
        Any error raised by ``zipfile.ZipFile`` when the archive cannot be
        opened is propagated unchanged.
    """
    gdb_extensions = (".gdbtable", ".gdbtablx", ".gdbindexes", ".spx")

    with zipfile.ZipFile(zip_path, "r") as archive:
        # Lowercase once so every check below is case-insensitive.
        names = [name.lower() for name in archive.namelist()]

    # A name ending in ".gdb/" necessarily contains ".gdb/", so a single
    # substring test covers both the directory entry and its children.
    if any(".gdb/" in name or name.endswith(gdb_extensions) for name in names):
        return "gdb"
    if any(name.endswith(".shp") for name in names):
        return "shp"
    return "unknown"
def generate_unique_filename(folder="tmp", ext="parquet", digits=6):
    """Return a path inside *folder* that does not exist yet.

    The folder is created if it is missing. The file itself is NOT created,
    so the returned name is only guaranteed to be free at the moment of the
    check.

    Args:
        folder: Directory that will hold the file; created when absent.
        ext: File extension, without the leading dot.
        digits: Currently unused. Kept in the signature so existing callers
            that pass it keep working.

    Returns:
        A string of the form ``"{folder}/{id}.{ext}"`` where ``id`` is the
        decimal integer value of a random UUID4.
    """
    os.makedirs(folder, exist_ok=True)
    while True:
        candidate = f"{folder}/{uuid.uuid4().int}.{ext}"
        # Retry in the (very unlikely) event that the name is already taken.
        if not os.path.exists(candidate):
            return candidate
def generate_job_id(user_id: str) -> str:
    """Build a job identifier from a user id and the current time.

    Args:
        user_id: Identifier of the user the job belongs to.

    Returns:
        ``"{user_id}_{YYYYMMDDHHMMSS}"``, where the timestamp comes from
        ``datetime.now()`` (naive, second resolution).
    """
    return f"{user_id}_{datetime.now():%Y%m%d%H%M%S}"
def dataframe_validation(df_input, tmp_file):
    """Validate and clean WKT geometries, then export the result to Parquet.

    Per the original author's note, this is meant to run in a separate
    thread because the work is CPU-bound.

    Processing steps:
      1. Parse the WKT strings in the ``geometry`` column into a new
         ``geom`` column; rows whose value is not a string or cannot be
         parsed are dropped.
      2. Repair invalid geometries with ``buffer(0)`` and drop geometries
         that are empty afterwards.
      3. Drop the original WKT ``geometry`` column, assign EPSG:4326 as the
         CRS (overriding any existing one), and rename every attribute
         column to its stripped, upper-cased form (``geom`` is left as is).
      4. Write the result to ``tmp_file`` as Parquet.

    Args:
        df_input: DataFrame with a ``geometry`` column holding WKT strings.
            It is copied first, so the caller's frame is not modified.
        tmp_file: Destination passed to ``to_parquet``.

    Returns:
        The number of rows written.

    Raises:
        ValueError: If no row has a usable geometry, either after WKT
            parsing or after topology repair.
    """
    no_valid_rows_msg = "Tidak ada data spasial valid yang ditemukan."

    def safe_load_wkt(raw):
        """Parse one WKT value; return None when it is unusable."""
        if not isinstance(raw, str):
            return None
        try:
            return wkt.loads(raw)
        except Exception:
            # Deliberately best-effort: any parsing failure (Shapely errors
            # included) just marks the row as invalid so it is filtered out.
            return None

    # Work on a copy so the caller's data is left untouched.
    export_df = df_input.copy()

    # Step 1: safe WKT loading, then drop rows where parsing failed.
    export_df["geom"] = export_df["geometry"].apply(safe_load_wkt)
    export_df = export_df[export_df["geom"].notnull()]
    if export_df.empty:
        raise ValueError(no_valid_rows_msg)

    export_df = gpd.GeoDataFrame(export_df, geometry="geom")

    # Step 2: fix topology. buffer(0) is the usual GIS trick for repairing
    # light topology problems such as self-intersecting polygons.
    export_df["geom"] = export_df["geom"].apply(
        lambda g: g if g.is_valid else g.buffer(0)
    )
    # The repair can leave a geometry empty (rare); drop those rows and make
    # sure we never write a Parquet file without any rows.
    export_df = export_df[~export_df["geom"].is_empty]
    if export_df.empty:
        raise ValueError(no_valid_rows_msg)

    # Step 3: finalise (CRS and column names).
    export_df = export_df.drop(columns=["geometry"])  # old WKT string column
    export_df = export_df.set_crs("EPSG:4326", allow_override=True)
    # Attribute columns become UPPERCASE; strip() removes stray whitespace
    # (" ID " -> "ID"). The geometry column 'geom' stays lowercase.
    export_df = export_df.rename(
        columns=lambda c: c if c == "geom" else str(c).strip().upper()
    )

    # Step 4: save to Parquet.
    export_df.to_parquet(tmp_file)
    return len(export_df)