satupeta-main/app/mapset_pipeline/core/processing/analyzer.py
2026-02-23 12:20:42 +07:00

182 lines
6.2 KiB
Python
Executable File

import os
import asyncio
import pandas as pd
import geopandas as gpd
from app.response.res import errorRes
from app.mapset_pipeline.utils.file_ops import generate_unique_filename, dataframe_validation
from app.mapset_pipeline.utils.formatters import safe_json
from .geometry_build import is_geom_empty, detect_and_build_geometry, attach_polygon_geometry_auto
from app.mapset_pipeline.core.clients.ai_client import generate_metadata
from app.mapset_pipeline.core.publication.publish_geoserver import publish_layer_to_geoserver
from app.mapset_pipeline.core.publication.publish_geonetwork import publish_metadata
async def analyze_and_clean_dataframe(
    df: pd.DataFrame,
    ext: str,
    filename: str,
    fileDesc: str,
    *,
    nama_opd: str = (
        "Badan Penanggulangan Bencana Daerah (BPBD) Provinsi Jatim"
    ),
):
    """Analyze an uploaded table, attach geometry and stage it for publishing.

    Processing steps:
      1. Detect / build geometry for ``df``.
      2. Determine the dominant geometry type and validity statistics.
      3. Build a JSON-safe preview and warning examples.
      4. Ask the AI client for suggested metadata (best effort).
      5. Hand the prepared frame to ``dataframe_validation`` together with a
         unique temporary ``.parquet`` path.

    Args:
        df: Raw table read from the uploaded file.
        ext: File type/extension of the upload; echoed back in the response.
        filename: Name of the uploaded file; echoed back and sent to the AI
            client as context.
        fileDesc: Short user-supplied description sent to the AI client.
        nama_opd: Organisation name sent to the AI client. Keyword-only;
            defaults to the value that was previously hard-coded.

    Returns:
        On success, a dict with row/column information, geometry statistics,
        warnings, a preview, the AI metadata suggestion and ``tmp_path``.
        When no geometry could be attached, the result of ``errorRes(...)``
        with status code 422 is returned instead.
    """

    def normalize_geom_type(geom_type):
        # Collapse "MultiX" into "X" so e.g. Polygon and MultiPolygon are
        # reported as a single geometry type.
        if geom_type and geom_type.startswith("Multi"):
            return geom_type.replace("Multi", "")
        return geom_type

    # 1. Geometry detection; fall back to automatic polygon matching when the
    #    first pass yields no geometry at all.
    result = detect_and_build_geometry(df, master_polygons=None)
    if not hasattr(result, "geometry") or result.geometry.isna().all():
        result = attach_polygon_geometry_auto(result)

    # 2. Geometry type analysis. Bail out early if there is still no usable
    #    geometry column.
    if not (isinstance(result, gpd.GeoDataFrame) and "geometry" in result.columns):
        res = {
            "message": "Tidak menemukan tabel yang relevan atau kolom geometri.",
            "file_type": ext,
            "rows": len(df),
            "columns": len(df.columns),
            "geometry_valid": 0,
            "geometry_empty": 0,
            "geometry_valid_percent": 0,
            "warnings": [],
            "warning_examples": [],
            "preview": []
        }
        # The error structure is returned (not raised) so the router/service
        # layer can pass it on.
        return errorRes(
            message="Tidak berhasil mencocokan geometry pada tabel.",
            details=res,
            status_code=422,
        )

    geom_types = (
        result.geometry
        .dropna()
        .geom_type
        .apply(normalize_geom_type)
        .unique()
    )
    # Only the first distinct type is reported; "None" marks a frame whose
    # geometries are all missing.
    geom_type = geom_types[0] if len(geom_types) > 0 else "None"
    null_geom = result.geometry.isna().sum()
    print(f"[INFO] Tipe Geometry: {geom_type}")
    print(f"[INFO] Jumlah geometry kosong: {null_geom}")

    # 3. Value cleaning: NA and +/-inf are replaced with None.
    result = result.replace([pd.NA, float('inf'), float('-inf')], None)

    # Validity statistics. The emptiness mask is computed once and reused for
    # both the counters and the warning examples.
    total_rows = len(result)
    empty_mask = result['geometry'].apply(is_geom_empty)
    empty_count = int(empty_mask.sum())
    valid_count = total_rows - empty_count
    # Guard against a zero-row frame, which would otherwise divide by zero.
    match_percentage = (valid_count / total_rows) * 100 if total_rows else 0.0

    warnings = []
    warning_examples = []
    if empty_count > 0:
        warnings.append(
            f"{empty_count} dari {len(result)} baris tidak memiliki geometry yang valid "
            f"({100 - match_percentage:.2f}% data gagal cocok)."
        )
        # Sample of offending rows (capped at 500) for the response.
        warning_examples = result[empty_mask].head(500).to_dict(orient="records")

    # Preview data: work on a copy so the main frame keeps its geometry
    # objects, and convert geometry to WKT text.
    data_df = result.copy()
    if 'geometry' in data_df.columns:
        data_df['geometry'] = data_df['geometry'].apply(
            lambda g: g.wkt if g is not None else None
        )
    preview_data = data_df.to_dict(orient="records")

    # JSON sanitation of every cell via safe_json.
    preview_safe = [
        {k: safe_json(v) for k, v in row.items()} for row in preview_data
    ]
    warning_safe = [
        {k: safe_json(v) for k, v in row.items()} for row in warning_examples
    ]

    # 4. AI metadata generation (best effort: a failure only logs a warning
    #    and yields an empty suggestion).
    # NOTE(review): generate_metadata is called synchronously inside this
    # coroutine; if it performs network I/O it blocks the event loop - confirm.
    ai_context = {
        "nama_file_peta": filename,
        "nama_opd": nama_opd,
        "tipe_data_spasial": geom_type,
        "deskripsi_singkat": fileDesc,
        "struktur_atribut_data": {},
    }
    try:
        ai_suggest = generate_metadata(ai_context)
    except Exception as e:
        print(f"[WARNING] Gagal generate metadata AI: {e}")
        ai_suggest = {}

    # 5. Stage to a temporary parquet path. A unique file name is generated
    #    per call, and the synchronous helper runs in a worker thread so the
    #    event loop is not blocked.
    # NOTE(review): the WKT-converted copy (data_df) is what gets passed on,
    # presumably because dataframe_validation expects WKT text - confirm.
    tmp_file = generate_unique_filename(folder="tmp", ext="parquet")
    await asyncio.to_thread(dataframe_validation, data_df, tmp_file)

    response = {
        "message": "File berhasil dibaca dan dianalisis.",
        "file_name": filename,
        "file_type": ext,
        "rows": int(total_rows),
        "columns": list(map(str, result.columns)),
        "geometry_valid": int(valid_count),
        "geometry_empty": int(empty_count),
        "geometry_valid_percent": float(round(match_percentage, 2)),
        "geometry_type": geom_type,
        "warnings": warnings,
        "warning_rows": warning_safe,
        "preview": preview_safe,
        "metadata_suggest": ai_suggest,
        "tmp_path": tmp_file
    }
    return response
async def publish_mapset(table_name: str, job_id: str):
    """Publish a table to GeoServer and then publish its metadata.

    Args:
        table_name: Name of the table/layer to publish.
        job_id: Identifier of the pipeline job, forwarded to the GeoServer
            publishing step.

    Returns:
        A dict with ``geos_link`` (the ``layer_url`` entry of the GeoServer
        publishing result) and ``uuid`` (the value returned by
        ``publish_metadata``).

    Raises:
        RuntimeError: If any publishing step fails; the original exception is
            chained as the cause.
    """
    try:
        geos_link = publish_layer_to_geoserver(table_name, job_id)
        metadata_uuid = await publish_metadata(
            table_name=table_name,
            geoserver_links=geos_link,
        )
        # TODO: job-status tracking (update_job_status -> "FINISHED" here,
        # "FAILED" in the except branch) is currently disabled.
        return {
            "geos_link": geos_link["layer_url"],
            # Return the identifier actually produced by publish_metadata
            # instead of a hard-coded placeholder value.
            "uuid": metadata_uuid,
        }
    except Exception as e:
        raise RuntimeError(f"Publish layer gagal: {e}") from e