182 lines
6.2 KiB
Python
Executable File
182 lines
6.2 KiB
Python
Executable File
import os
|
|
import asyncio
|
|
import pandas as pd
|
|
import geopandas as gpd
|
|
from app.response.res import errorRes
|
|
|
|
from app.mapset_pipeline.utils.file_ops import generate_unique_filename, dataframe_validation
|
|
from app.mapset_pipeline.utils.formatters import safe_json
|
|
from .geometry_build import is_geom_empty, detect_and_build_geometry, attach_polygon_geometry_auto
|
|
from app.mapset_pipeline.core.clients.ai_client import generate_metadata
|
|
from app.mapset_pipeline.core.publication.publish_geoserver import publish_layer_to_geoserver
|
|
from app.mapset_pipeline.core.publication.publish_geonetwork import publish_metadata
|
|
|
|
def _normalize_geom_type(geom_type):
    """Collapse a ``Multi*`` geometry type name to its single-part name.

    ``"MultiPolygon"`` becomes ``"Polygon"``; names without the prefix and
    falsy values (``None``, ``""``) are returned unchanged.
    """
    if not geom_type:
        return geom_type
    return geom_type.removeprefix("Multi")


async def analyze_and_clean_dataframe(df: pd.DataFrame, ext: str, filename: str, fileDesc: str):
    """Analyse an uploaded table, attach geometry and stage it as Parquet.

    Processing steps:
      1. Detect or build the geometry column.
      2. Determine the geometry type and count valid / empty geometries.
      3. Build the JSON-safe preview and warning rows.
      4. Ask the AI client for metadata suggestions (best effort).
      5. Write the data to a uniquely named temporary Parquet file.

    Args:
        df: Raw table read from the uploaded file.
        ext: File type of the upload; echoed back as ``file_type``.
        filename: Name of the uploaded file; echoed back and sent to the
            AI metadata client.
        fileDesc: Short user-supplied description sent to the AI client.

    Returns:
        On success, a dict with row/column info, geometry statistics,
        warnings, preview rows, the AI metadata suggestion and ``tmp_path``.
        When no geometry can be matched, the value produced by
        ``errorRes(..., status_code=422)``.
    """
    # 1. Geometry detection, with a polygon-matching fallback when the first
    #    pass produced no usable geometry at all.
    result = detect_and_build_geometry(df, master_polygons=None)

    if not hasattr(result, "geometry") or result.geometry.isna().all():
        result = attach_polygon_geometry_auto(result)

    # 2. Geometry type analysis.
    if not (isinstance(result, gpd.GeoDataFrame) and "geometry" in result.columns):
        # No geometry could be detected: report an error payload that mirrors
        # the shape of the success response.
        res = {
            "message": "Tidak menemukan tabel yang relevan atau kolom geometri.",
            "file_type": ext,
            "rows": len(df),
            "columns": len(df.columns),
            "geometry_valid": 0,
            "geometry_empty": 0,
            "geometry_valid_percent": 0,
            "warnings": [],
            "warning_examples": [],
            "preview": []
        }
        return errorRes(message="Tidak berhasil mencocokan geometry pada tabel.", details=res, status_code=422)

    geom_types = (
        result.geometry
        .dropna()
        .geom_type
        .apply(_normalize_geom_type)
        .unique()
    )
    # The first distinct (normalised) type represents the layer.
    geom_type = geom_types[0] if len(geom_types) > 0 else "None"
    null_geom = result.geometry.isna().sum()

    print(f"[INFO] Tipe Geometry: {geom_type}")
    print(f"[INFO] Jumlah geometry kosong: {null_geom}")

    # 3. Value cleaning: NA and infinities are not JSON-representable.
    # NOTE(review): the effect of replace(..., None) has differed between
    # pandas versions - confirm against the pinned pandas release.
    result = result.replace([pd.NA, float('inf'), float('-inf')], None)

    # Validity statistics. The emptiness mask is computed once and reused for
    # both the count and the example rows.
    total_rows = len(result)
    empty_mask = result['geometry'].apply(is_geom_empty)
    empty_count = empty_mask.sum()
    valid_count = total_rows - empty_count
    # Guard against an empty table, which would otherwise divide by zero.
    match_percentage = (valid_count / total_rows) * 100 if total_rows else 0.0

    warnings = []
    warning_examples = []
    if empty_count > 0:
        warnings.append(
            f"{empty_count} dari {total_rows} baris tidak memiliki geometry yang valid "
            f"({100 - match_percentage:.2f}% data gagal cocok)."
        )
        # Sample of the offending rows (capped at 500) for the response.
        warning_examples = result[empty_mask].head(500).to_dict(orient="records")

    # Preview data: work on a copy so the main frame keeps its geometry
    # objects, and convert geometry to WKT text.
    data_df = result.copy()
    if 'geometry' in data_df.columns:
        data_df['geometry'] = data_df['geometry'].apply(
            lambda g: g.wkt if g is not None else None
        )

    preview_data = data_df.to_dict(orient="records")

    # JSON sanitising of every cell via safe_json.
    preview_safe = [
        {k: safe_json(v) for k, v in row.items()} for row in preview_data
    ]
    warning_safe = [
        {k: safe_json(v) for k, v in row.items()} for row in warning_examples
    ]

    # 4. AI metadata generation (best effort: a failure must not abort the
    #    analysis, so it falls back to an empty suggestion).
    ai_context = {
        "nama_file_peta": filename,
        "nama_opd": "Badan Penanggulangan Bencana Daerah (BPBD) Provinsi Jatim",  # TODO: make dynamic
        "tipe_data_spasial": geom_type,
        "deskripsi_singkat": fileDesc,
        "struktur_atribut_data": {},
    }

    try:
        ai_suggest = generate_metadata(ai_context)
    except Exception as e:
        print(f"[WARNING] Gagal generate metadata AI: {e}")
        ai_suggest = {}

    # 5. Save to a temporary Parquet file; the name comes from
    #    generate_unique_filename.
    tmp_file = generate_unique_filename(folder="tmp", ext="parquet")

    # The synchronous conversion runs in a worker thread so the event loop is
    # not blocked.
    # NOTE(review): the WKT copy (data_df), not the frame holding geometry
    # objects, is what gets handed to dataframe_validation - confirm intended.
    print('start')
    await asyncio.to_thread(dataframe_validation, data_df, tmp_file)
    print('pass')

    response = {
        "message": "File berhasil dibaca dan dianalisis.",
        "file_name": filename,
        "file_type": ext,
        "rows": int(total_rows),
        "columns": list(map(str, result.columns)),
        "geometry_valid": int(valid_count),
        "geometry_empty": int(empty_count),
        "geometry_valid_percent": float(round(match_percentage, 2)),
        "geometry_type": geom_type,
        "warnings": warnings,
        "warning_rows": warning_safe,
        "preview": preview_safe,
        "metadata_suggest": ai_suggest,
        "tmp_path": tmp_file
    }

    return response
|
|
|
|
|
|
async def publish_mapset(table_name: str, job_id: str):
    """Publish a table as a GeoServer layer and register its metadata.

    Args:
        table_name: Name of the table to publish.
        job_id: Identifier of the job, forwarded to the GeoServer publisher.

    Returns:
        A dict with ``geos_link`` (the ``layer_url`` reported by the
        GeoServer publisher) and ``uuid`` (the value returned by
        ``publish_metadata``).

    Raises:
        RuntimeError: If any publishing step fails; the original exception
            is chained as the cause.
    """
    try:
        # NOTE(review): this call is not awaited, so it runs on the event
        # loop - confirm it is cheap or move it to a worker thread.
        geos_link = publish_layer_to_geoserver(table_name, job_id)

        uuid = await publish_metadata(
            table_name=table_name,
            geoserver_links=geos_link
        )

        # Return the identifier produced by the metadata publication rather
        # than a hard-coded placeholder.
        return {
            "geos_link": geos_link["layer_url"],
            "uuid": uuid
        }

    except Exception as e:
        raise RuntimeError(f"Publish layer gagal: {e}") from e
|
|
|
|
|