# satupeta-main/app/mapset_pipeline/service.py
# Last modified: 2026-02-23 12:20:42 +07:00
# 287 lines | 10 KiB | Python | Executable File
import os
import shutil
import pandas as pd
from fastapi import UploadFile, HTTPException
from typing import Optional
# --- Internal Modules ---
from .api.schemas import UploadRequest, PdfRequest
from .core.processing.analyzer import analyze_and_clean_dataframe, publish_mapset
from .core.readers import (
read_csv,
read_shp,
read_gdb,
read_mpk,
read_pdf,
convert_df
)
from .data.repository import (
generate_unique_table_name,
insert_parquet_to_postgis,
save_author_metadata,
call_cleansing_procedure
)
from app.mapset_pipeline.utils.file_ops import (
detect_zip_type,
generate_job_id,
)
from app.mapset_pipeline.utils.formatters import (
save_xml_to_sld,
)
# --- Legacy/External Modules (Sesuai kode asli Anda) ---
from app.core.config import UPLOAD_FOLDER, MAX_FILE_MB, GEONETWORK_URL
from utils.logger_config import log_activity
# from api.routers.datasets_router import (
# upload_to_main
# )
async def handle_file_analysis(
    file: UploadFile,
    page: Optional[str] = "",
    sheet: Optional[str] = "",
    fileDesc: Optional[str] = ""
):
    """
    Orchestrator for the /upload endpoint.

    1. Persist the uploaded file to disk.
    2. Pick a reader based on the file extension.
    3. Call the processor for analysis/cleaning.
    4. Remove the physical file.

    Raises:
        HTTPException: 413 when the file exceeds MAX_FILE_MB, 400 for an
            unsupported type or invalid ZIP content, 422 when no valid table
            is found, 500 for unexpected errors.
    """
    fname = file.filename
    ext = os.path.splitext(fname)[1].lower()

    # 1. Validate & persist the file.
    # The upload is read fully in-memory to check its size — beware memory
    # usage for very large files.
    contents = await file.read()
    size_mb = len(contents) / (1024 * 1024)
    if size_mb > MAX_FILE_MB:
        raise HTTPException(status_code=413, detail="Ukuran File Terlalu Besar")

    tmp_path = UPLOAD_FOLDER / fname
    # Make sure the target folder exists.
    os.makedirs(UPLOAD_FOLDER, exist_ok=True)
    with open(tmp_path, "wb") as f:
        f.write(contents)

    df = None
    try:
        # 2. Route to the proper reader based on the extension.
        print(f"[INFO] Processing file type: {ext}")
        if ext == ".csv":
            df = read_csv(str(tmp_path))
        elif ext == ".xlsx":
            # read_csv also handles xlsx, per the original code.
            df = read_csv(str(tmp_path), sheet)
        elif ext == ".mpk":
            df = read_mpk(str(tmp_path))
        elif ext == ".pdf":
            # PDF logic is special: it may yield zero, one, or many tables.
            tbl = read_pdf(tmp_path, page)
            if len(tbl) == 0:
                return {
                    "message": "Tidak ditemukan tabel valid pada halaman yang dipilih",
                    "tables": {},
                    "file_type": ext
                }
            elif len(tbl) > 1:
                return {
                    "message": "File berhasil dibaca, ditemukan banyak tabel.",
                    "tables": tbl,
                    "file_type": ext
                }
            else:
                df = convert_df(tbl[0])
        elif ext == ".zip":
            zip_type = detect_zip_type(str(tmp_path))
            if zip_type == "shp":
                df = read_shp(str(tmp_path))
            elif zip_type == "gdb":
                df = read_gdb(str(tmp_path))
            else:
                raise HTTPException(status_code=400, detail="ZIP file tidak mengandung SHP / GDB valid.")
        else:
            raise HTTPException(status_code=400, detail="Unsupported file type")

        # Guard against an empty dataframe.
        if df is None or (hasattr(df, "empty") and df.empty):
            raise HTTPException(status_code=422, detail="File berhasil dibaca, tetapi tidak ditemukan tabel valid")

        # 3. Delegate to the processor (cleaning & validation logic).
        result_analysis = await analyze_and_clean_dataframe(df, ext, fname, fileDesc)
        return result_analysis
    except HTTPException:
        # BUGFIX: the deliberate 400/422 errors raised above were previously
        # caught by the generic handler below and re-raised as 500s, hiding
        # the intended status codes from the client. Re-raise them as-is.
        raise
    except Exception as e:
        print(f"[ERROR] handle_file_analysis: {e}")
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # 4. Clean up the raw uploaded file. The temp parquet produced by the
        # processor stays alive until the frontend sends the ingest request.
        if tmp_path.exists():
            try:
                os.remove(tmp_path)
            except OSError:
                # Best-effort cleanup; never mask the real result/error.
                pass
async def process_pdf_file(payload: PdfRequest):
    """
    Helper used when the user uploaded a PDF and wants a specific table.

    Raises:
        HTTPException: 422 when the payload holds no valid table,
            500 for any other processing failure.
    """
    try:
        # Convert the request body to a DataFrame. convert_df comes from the
        # PDF reader module and expects this specific dict layout.
        df = convert_df(payload.model_dump())
        if df is None or (hasattr(df, "empty") and df.empty):
            raise HTTPException(status_code=422, detail="Tidak ada tabel valid dalam PDF")
        # Reuse the same processor logic as regular uploads.
        return await analyze_and_clean_dataframe(
            df, '.pdf', payload.fileName, payload.fileDesc
        )
    except HTTPException:
        # BUGFIX: the 422 raised above was previously swallowed by the
        # generic handler and converted to a 500. Preserve it.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
async def execute_postgis_ingestion(payload: UploadRequest, user_id: int):
    """
    Orchestrator for the /process-to-postgis endpoint.

    1. Receive the data (JSON rows).
    2. Convert it to a temporary Parquet file.
    3. Upload to PostGIS (via the repository).
    4. Persist metadata (via the repository).
    5. Trigger cleansing & publishing.
    6. Log the activity.

    Raises:
        HTTPException: 500 wrapping any failure in the pipeline.
    """
    job_id = generate_job_id(str(user_id))
    temp_parquet_path = None  # set once the temp file exists; used for cleanup
    try:
        # 1. Generate a unique table name.
        table_name = await generate_unique_table_name(payload.title)

        # 2. Prepare the data (JSON -> DataFrame -> Parquet).
        # A parquet file is needed because insert_parquet_to_postgis reads a
        # file; this also separates memory load between the API and the DB
        # process.
        df = pd.DataFrame(payload.rows)
        # Upper-case all column names.
        df.columns = [col.upper() for col in df.columns]
        # Standardise the geometry column name coming from the frontend.
        if "GEOMETRY" in df.columns:
            df.rename(columns={"GEOMETRY": "GEOM"}, inplace=True)

        # Write the temp file the repository will consume.
        temp_parquet_name = f"{job_id}.parquet"
        temp_parquet_path = os.path.join("tmp", temp_parquet_name)
        os.makedirs("tmp", exist_ok=True)
        # Save parquet (uses the pyarrow or fastparquet engine).
        df.to_parquet(temp_parquet_path, index=False)

        # 3. Insert into PostGIS. The repository reads the parquet file,
        # cleans the geometry, and copies the rows into the database.
        db_result = await insert_parquet_to_postgis(temp_parquet_name, table_name)

        # 4. Persist metadata. Geometry types and row count come from the DB
        # insert result, which is more accurate than the raw payload.
        final_geom_types = [db_result['geom_type']]  # simplified to a list
        row_count = db_result['row_count']
        await save_author_metadata(
            payload_author=payload.author,
            table_name=table_name,
            dataset_title=payload.title,
            geom_types=final_geom_types,
            row_count=row_count,
            user_id=user_id
        )

        # 5. Log the activity.
        await log_activity(
            user_id=user_id,
            action_type="UPLOAD",
            action_title=f"Upload dataset {table_name}",
            details={"table_name": table_name, "rows": row_count}
        )

        # 6. Post-processing (external APIs).
        result = {
            "job_id": job_id,
            "job_status": "wait",
            "table_name": table_name,
            "status": "success",
            "message": f"Tabel '{table_name}' berhasil dibuat.",
            "total_rows": row_count,
            "geometry_type": final_geom_types,
            "crs": payload.author.get("crs", "EPSG:4326"),
            "metadata_uuid": ""
        }

        # Save the style (SLD).
        save_xml_to_sld(payload.style, job_id)

        # Run the SQL cleansing procedure; a failure here is non-fatal and
        # only degrades the reported job status.
        try:
            cleansing_status = await call_cleansing_procedure(table_name)
        except Exception as e:
            cleansing_status = "failed"
            print(f"Cleansing warning: {e}")
        result['job_status'] = cleansing_status

        # Publish the layer (GeoServer/GeoNetwork).
        publish_info = await publish_mapset(table_name, job_id)
        result['metadata_uuid'] = publish_info.get('uuid', '')

        # 7. Upload to the main portal (mapset integration).
        mapset_payload = {
            "name": payload.title,
            "description": payload.author.get("abstract"),
            "scale": "1:25000",
            # IDs hard-coded per the original code (consider moving to
            # config/env).
            'projection_system_id': '0196c746-d1ba-7f1c-9706-5df738679cc7',
            "category_id": payload.author.get("mapsetCategory"),
            "data_status": "sementara",
            'classification_id': '01968b4b-d3f9-76c9-888c-ee887ac31ce4',
            'producer_id': '01968b54-0000-7a67-bd10-975b8923b93e',
            "layer_type": final_geom_types[0],
            'source_id': ['019c03ef-35e1-738b-858d-871dc7d1e4d6'],
            "layer_url": publish_info.get('geos_link', ''),
            "metadata_url": f"{GEONETWORK_URL}/srv/eng/catalog.search#/metadata/{publish_info.get('uuid', '')}",
            "coverage_level": "provinsi",
            "coverage_area": "kabupaten",
            "data_update_period": "Tahunan",
            "data_version": "2026",
            "is_popular": False,
            "is_active": True,
            'regional_id': '01968b53-a910-7a67-bd10-975b8923b92e',
            "notes": "Mapset baru dibuat",
            "status_validation": "on_verification",
        }
        # await upload_to_main(mapset_payload)
        return result
    except Exception as e:
        # Error handling & logging. BUGFIX: log_activity may itself fail
        # (e.g. DB down) — guard it so it never masks the original error.
        try:
            await log_activity(
                user_id=user_id,
                action_type="ERROR",
                action_title="Upload gagal",
                details={"error": str(e)}
            )
        except Exception as log_err:
            print(f"[WARN] Failed to log error activity: {log_err}")
        print(f"[ERROR] execute_postgis_ingestion: {e}")
        # Re-raise as an HTTP exception so the router returns a clean 500.
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # BUGFIX: the temporary parquet file was never deleted, so tmp/
        # grew by one file per ingest. The repository has already consumed
        # it by the time we get here (success or failure).
        if temp_parquet_path and os.path.exists(temp_parquet_path):
            try:
                os.remove(temp_parquet_path)
            except OSError:
                pass