import os
import asyncio

import pandas as pd
import geopandas as gpd

from app.response.res import errorRes
from app.mapset_pipeline.utils.file_ops import generate_unique_filename, dataframe_validation
from app.mapset_pipeline.utils.formatters import safe_json
from .geometry_build import is_geom_empty, detect_and_build_geometry, attach_polygon_geometry_auto
from app.mapset_pipeline.core.clients.ai_client import generate_metadata
from app.mapset_pipeline.core.publication.publish_geoserver import publish_layer_to_geoserver
from app.mapset_pipeline.core.publication.publish_geonetwork import publish_metadata


def _normalize_geom_type(geom_type):
    """Strip the ``Multi`` prefix from a geometry type name.

    Falsy values and names that do not start with ``Multi`` are returned
    unchanged.
    """
    if geom_type and geom_type.startswith("Multi"):
        return geom_type.replace("Multi", "")
    return geom_type


async def analyze_and_clean_dataframe(df: pd.DataFrame, ext: str, filename: str, fileDesc: str):
    """Analyse an uploaded table and build the preview/summary response.

    Steps:
        1. Detect (or attach) geometry.
        2. Determine the geometry type and compute validity statistics.
        3. Build the preview rows and warnings.
        4. Ask the AI client for metadata suggestions.
        5. Write the data to a temporary parquet file.

    Args:
        df: Table read from the uploaded file.
        ext: File type, echoed back as ``file_type``.
        filename: File name, echoed back and passed to the AI context.
        fileDesc: Short description passed to the AI context.

    Returns:
        A dict summarising the analysis (row count, geometry statistics,
        warnings, preview rows, metadata suggestion and ``tmp_path``). When
        no geometry column could be produced, the value returned by
        ``errorRes(...)`` with ``status_code=422`` is returned instead.
    """
    # 1. Geometry detection
    result = detect_and_build_geometry(df, master_polygons=None)
    if not hasattr(result, "geometry") or result.geometry.isna().all():
        result = attach_polygon_geometry_auto(result)

    # 2. Geometry type analysis
    has_geometry = isinstance(result, gpd.GeoDataFrame) and "geometry" in result.columns
    if not has_geometry:
        # Fallback when no geometry could be detected.
        res = {
            "message": "Tidak menemukan tabel yang relevan atau kolom geometri.",
            "file_type": ext,
            "rows": len(df),
            "columns": len(df.columns),
            "geometry_valid": 0,
            "geometry_empty": 0,
            "geometry_valid_percent": 0,
            "warnings": [],
            "warning_examples": [],
            "preview": [],
        }
        # The error structure is returned (not raised) so the router/service
        # layer can handle it.
        return errorRes(
            message="Tidak berhasil mencocokan geometry pada tabel.",
            details=res,
            status_code=422,
        )

    geom_types = (
        result.geometry
        .dropna()
        .geom_type
        .apply(_normalize_geom_type)
        .unique()
    )
    # Only the first distinct type is reported; "None" (a string) means that
    # no non-null geometry exists.
    geom_type = geom_types[0] if len(geom_types) > 0 else "None"
    null_geom = result.geometry.isna().sum()
    print(f"[INFO] Tipe Geometry: {geom_type}")
    print(f"[INFO] Jumlah geometry kosong: {null_geom}")

    # 3. Clean data values
    result = result.replace([pd.NA, float('inf'), float('-inf')], None)

    # Validity statistics. The emptiness mask is computed once and reused for
    # both the count and the example rows.
    empty_mask = result['geometry'].apply(is_geom_empty)
    empty_count = empty_mask.sum()
    total_rows = len(result)
    valid_count = total_rows - empty_count
    # Guard against a zero-row table, which would otherwise raise
    # ZeroDivisionError.
    match_percentage = (valid_count / total_rows) * 100 if total_rows else 0.0

    warnings = []
    if empty_count > 0:
        warnings.append(
            f"{empty_count} dari {total_rows} baris tidak memiliki geometry yang valid "
            f"({100 - match_percentage:.2f}% data gagal cocok)."
        )
        # Sample of the offending rows (capped at 500).
        warning_examples = result[empty_mask].head(500).to_dict(orient="records")
    else:
        warning_examples = []

    # Preview data: work on a copy so the main frame keeps its geometry
    # objects, and convert geometry to WKT for the JSON response.
    data_df = result.copy()
    if 'geometry' in data_df.columns:
        data_df['geometry'] = data_df['geometry'].apply(
            lambda g: g.wkt if g is not None else None
        )

    preview_data = data_df.to_dict(orient="records")

    # Sanitise values for JSON via safe_json.
    preview_safe = [
        {k: safe_json(v) for k, v in row.items()} for row in preview_data
    ]
    warning_safe = [
        {k: safe_json(v) for k, v in row.items()} for row in warning_examples
    ]

    # 4. AI metadata generation
    ai_context = {
        "nama_file_peta": filename,
        # TODO: this should ideally be dynamic rather than hard-coded.
        "nama_opd": "Badan Penanggulangan Bencana Daerah (BPBD) Provinsi Jatim",
        "tipe_data_spasial": geom_type,
        "deskripsi_singkat": fileDesc,
        "struktur_atribut_data": {},
    }
    try:
        # NOTE(review): called synchronously inside a coroutine; if it does
        # network I/O it will block the event loop - confirm.
        ai_suggest = generate_metadata(ai_context)
    except Exception as e:
        # Best effort: a failed suggestion must not fail the whole analysis.
        print(f"[WARNING] Gagal generate metadata AI: {e}")
        ai_suggest = {}

    # 5. Save to a temporary parquet file
    # A unique file name is used so concurrent requests do not collide.
    tmp_file = generate_unique_filename(folder="tmp", ext="parquet")

    # The synchronous conversion runs in a worker thread so the event loop is
    # not blocked.
    # NOTE(review): the WKT-converted copy (data_df) is what gets passed here,
    # not the frame holding geometry objects - confirm this is intended.
    print('start')
    await asyncio.to_thread(dataframe_validation, data_df, tmp_file)
    print('pass')

    response = {
        "message": "File berhasil dibaca dan dianalisis.",
        "file_name": filename,
        "file_type": ext,
        "rows": int(total_rows),
        "columns": list(map(str, result.columns)),
        "geometry_valid": int(valid_count),
        "geometry_empty": int(empty_count),
        "geometry_valid_percent": float(round(match_percentage, 2)),
        "geometry_type": geom_type,
        "warnings": warnings,
        "warning_rows": warning_safe,
        "preview": preview_safe,
        "metadata_suggest": ai_suggest,
        "tmp_path": tmp_file,
    }

    return response


async def publish_mapset(table_name: str, job_id: str):
    """Publish a table as a GeoServer layer and publish its metadata.

    Args:
        table_name: Name of the table to publish.
        job_id: Job identifier forwarded to the GeoServer publisher.

    Returns:
        A dict with ``geos_link`` (the ``layer_url`` entry of the GeoServer
        result) and ``uuid``.

    Raises:
        RuntimeError: If any step fails; the original exception is chained.
    """
    try:
        # NOTE(review): synchronous call inside a coroutine; if it performs
        # blocking I/O it will stall the event loop - confirm.
        geos_link = publish_layer_to_geoserver(table_name, job_id)
        metadata_uuid = await publish_metadata(
            table_name=table_name,
            geoserver_links=geos_link
        )

        # TODO: await update_job_status(table_name, "FINISHED", job_id)

        return {
            "geos_link": geos_link["layer_url"],
            # NOTE(review): placeholder value; the uuid returned by
            # publish_metadata (metadata_uuid) is currently not returned.
            "uuid": "123123"
        }

    except Exception as e:
        # TODO: await update_job_status(table_name, "FAILED", job_id)
        raise RuntimeError(f"Publish layer gagal: {e}") from e