# Upload pipeline service: orchestrators for file analysis, PDF table
# selection, and PostGIS ingestion/publishing.
import os
|
|
import shutil
|
|
import pandas as pd
|
|
from fastapi import UploadFile, HTTPException
|
|
from typing import Optional
|
|
|
|
# --- Internal Modules ---
|
|
from .api.schemas import UploadRequest, PdfRequest
|
|
from .core.processing.analyzer import analyze_and_clean_dataframe, publish_mapset
|
|
from .core.readers import (
|
|
read_csv,
|
|
read_shp,
|
|
read_gdb,
|
|
read_mpk,
|
|
read_pdf,
|
|
convert_df
|
|
)
|
|
from .data.repository import (
|
|
generate_unique_table_name,
|
|
insert_parquet_to_postgis,
|
|
save_author_metadata,
|
|
call_cleansing_procedure
|
|
)
|
|
|
|
from app.mapset_pipeline.utils.file_ops import (
|
|
detect_zip_type,
|
|
generate_job_id,
|
|
)
|
|
from app.mapset_pipeline.utils.formatters import (
|
|
save_xml_to_sld,
|
|
)
|
|
|
|
# --- Legacy/External Modules (Sesuai kode asli Anda) ---
|
|
from app.core.config import UPLOAD_FOLDER, MAX_FILE_MB, GEONETWORK_URL
|
|
from utils.logger_config import log_activity
|
|
|
|
# from api.routers.datasets_router import (
|
|
# upload_to_main
|
|
# )
|
|
|
|
async def handle_file_analysis(
    file: UploadFile,
    page: Optional[str] = "",
    sheet: Optional[str] = "",
    fileDesc: Optional[str] = ""
):
    """
    Orchestrator for the /upload endpoint.

    1. Persist the uploaded file to disk.
    2. Pick a reader based on the file extension.
    3. Run the processor (analysis & cleaning) on the resulting DataFrame.
    4. Remove the raw uploaded file.

    Args:
        file: The uploaded file (read fully into memory for the size check).
        page: Page selector forwarded to the PDF reader.
        sheet: Sheet name forwarded to the XLSX reader.
        fileDesc: Free-text description forwarded to the analyzer.

    Returns:
        The analyzer's result dict, or — for PDFs with zero or multiple
        tables — an intermediate dict so the caller can pick a table.

    Raises:
        HTTPException: 413 (file too large), 400 (unsupported/invalid
            content), 422 (no valid table), or 500 (unexpected failure).
    """
    fname = file.filename
    ext = os.path.splitext(fname)[1].lower()

    # 1. Validate size & persist the file.
    # The whole upload is read into memory for the size check — beware of
    # memory usage for very large files.
    contents = await file.read()
    size_mb = len(contents) / (1024 * 1024)
    if size_mb > MAX_FILE_MB:
        raise HTTPException(status_code=413, detail="Ukuran File Terlalu Besar")

    tmp_path = UPLOAD_FOLDER / fname
    # Make sure the upload folder exists.
    os.makedirs(UPLOAD_FOLDER, exist_ok=True)

    with open(tmp_path, "wb") as f:
        f.write(contents)

    df = None
    try:
        # 2. Route to the right reader based on the extension.
        print(f"[INFO] Processing file type: {ext}")

        if ext == ".csv":
            df = read_csv(str(tmp_path))
        elif ext == ".xlsx":
            # read_csv also handles xlsx (original behavior preserved).
            df = read_csv(str(tmp_path), sheet)
        elif ext == ".mpk":
            df = read_mpk(str(tmp_path))
        elif ext == ".pdf":
            # PDF is special: the reader may return zero, one, or many tables.
            tbl = read_pdf(tmp_path, page)
            if len(tbl) == 0:
                return {
                    "message": "Tidak ditemukan tabel valid pada halaman yang dipilih",
                    "tables": {},
                    "file_type": ext
                }
            elif len(tbl) > 1:
                # Multiple tables found — return them so the frontend can
                # let the user pick one (handled later by process_pdf_file).
                return {
                    "message": "File berhasil dibaca, ditemukan banyak tabel.",
                    "tables": tbl,
                    "file_type": ext
                }
            else:
                df = convert_df(tbl[0])
        elif ext == ".zip":
            zip_type = detect_zip_type(str(tmp_path))
            if zip_type == "shp":
                df = read_shp(str(tmp_path))
            elif zip_type == "gdb":
                df = read_gdb(str(tmp_path))
            else:
                raise HTTPException(status_code=400, detail="ZIP file tidak mengandung SHP / GDB valid.")
        else:
            raise HTTPException(status_code=400, detail="Unsupported file type")

        # Reject an empty/missing DataFrame.
        if df is None or (hasattr(df, "empty") and df.empty):
            raise HTTPException(status_code=422, detail="File berhasil dibaca, tetapi tidak ditemukan tabel valid")

        # 3. Run the processor (cleaning & validation logic).
        return await analyze_and_clean_dataframe(df, ext, fname, fileDesc)

    except HTTPException:
        # BUGFIX: the generic handler below used to swallow the deliberate
        # 400/422 HTTPExceptions raised above and re-raise them as 500.
        # Re-raise them untouched so clients see the intended status code.
        raise
    except Exception as e:
        print(f"[ERROR] handle_file_analysis: {e}")
        raise HTTPException(status_code=500, detail=str(e))

    finally:
        # 4. Clean up the raw uploaded file. The temp parquet produced by
        # the processor stays alive until the frontend sends the ingest
        # request, so only the original upload is removed here.
        if tmp_path.exists():
            try:
                os.remove(tmp_path)
            except Exception:
                pass
|
|
|
|
|
|
async def process_pdf_file(payload: PdfRequest):
    """
    Helper used when the user uploaded a PDF and picked a specific table.

    Args:
        payload: Request body carrying the chosen table data plus
            ``fileName`` and ``fileDesc``.

    Returns:
        The analyzer's result dict (same processor as the main upload flow).

    Raises:
        HTTPException: 422 when no valid table can be built from the
            payload, 500 on any unexpected failure.
    """
    try:
        # Convert the request body into a DataFrame. convert_df comes from
        # the PDF reader module and expects its specific dict layout, so we
        # pass the pydantic model dumped to a plain dict.
        df = convert_df(payload.model_dump())

        if df is None or (hasattr(df, "empty") and df.empty):
            raise HTTPException(status_code=422, detail="Tidak ada tabel valid dalam PDF")

        # Reuse the same processor logic as handle_file_analysis.
        return await analyze_and_clean_dataframe(
            df, '.pdf', payload.fileName, payload.fileDesc
        )
    except HTTPException:
        # BUGFIX: keep the 422 above from being re-wrapped as a 500 by the
        # generic handler below.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
async def execute_postgis_ingestion(payload: UploadRequest, user_id: int):
    """
    Orchestrator for the /process-to-postgis endpoint.

    1. Receive the data (JSON rows).
    2. Convert it to a temporary Parquet file.
    3. Upload it to PostGIS (via the repository layer).
    4. Persist author metadata (via the repository layer).
    5. Trigger cleansing & publishing (GeoServer/GeoNetwork).
    6. Log the activity.

    Args:
        payload: Upload request carrying ``title``, ``rows``, ``author``
            (a dict — keys like ``crs``/``abstract`` are read below), and
            ``style`` (SLD XML).
        user_id: Numeric id of the uploading user; used for the job id,
            metadata, and activity logging.

    Returns:
        A result dict with job id/status, table name, row count, geometry
        types, CRS and (after publishing) the metadata UUID.

    Raises:
        HTTPException: 500 on any failure (the original error is logged
            via log_activity first).
    """
    job_id = generate_job_id(str(user_id))

    try:
        # 1. Generate a unique PostGIS table name from the dataset title.
        table_name = await generate_unique_table_name(payload.title)

        # 2. Prepare the data (JSON -> DataFrame -> Parquet).
        # We persist to parquet because insert_parquet_to_postgis reads a
        # file; this also separates the API's memory load from the DB step.
        df = pd.DataFrame(payload.rows)

        # Normalize column names to upper case.
        df.columns = [col.upper() for col in df.columns]

        # Standardize the geometry column name coming from the frontend.
        if "GEOMETRY" in df.columns:
            df.rename(columns={"GEOMETRY": "GEOM"}, inplace=True)

        # Write the temp file the repository will consume.
        # NOTE(review): this parquet is never deleted here — presumably the
        # repository or a cleanup job removes it; confirm.
        temp_parquet_name = f"{job_id}.parquet"
        temp_parquet_path = os.path.join("tmp", temp_parquet_name)
        os.makedirs("tmp", exist_ok=True)

        # Save parquet (pandas picks the pyarrow/fastparquet engine).
        df.to_parquet(temp_parquet_path, index=False)

        # 3. Insert into PostGIS.
        # Reads the parquet above, cleans the geometry, and COPYs to the DB.
        db_result = await insert_parquet_to_postgis(temp_parquet_name, table_name)

        # 4. Persist metadata.
        # Geometry type and row count are taken from the DB insert result,
        # which is more accurate than the incoming payload.
        final_geom_types = [db_result['geom_type']]  # simplified to a list
        row_count = db_result['row_count']

        await save_author_metadata(
            payload_author=payload.author,
            table_name=table_name,
            dataset_title=payload.title,
            geom_types=final_geom_types,
            row_count=row_count,
            user_id=user_id
        )

        # 5. Log the upload activity.
        await log_activity(
            user_id=user_id,
            action_type="UPLOAD",
            action_title=f"Upload dataset {table_name}",
            details={"table_name": table_name, "rows": row_count}
        )

        # 6. Post-processing (external APIs). Build the response skeleton;
        # job_status and metadata_uuid are filled in below.
        result = {
            "job_id": job_id,
            "job_status": "wait",
            "table_name": table_name,
            "status": "success",
            "message": f"Tabel '{table_name}' berhasil dibuat.",
            "total_rows": row_count,
            "geometry_type": final_geom_types,
            "crs": payload.author.get("crs", "EPSG:4326"),
            "metadata_uuid": ""
        }

        # Save the layer style (SLD) under the job id.
        save_xml_to_sld(payload.style, job_id)

        # Run the DB cleansing procedure; a failure here is non-fatal and
        # only downgrades the job status.
        try:
            cleansing_status = await call_cleansing_procedure(table_name)
        except Exception as e:
            cleansing_status = "failed"
            print(f"Cleansing warning: {e}")
        result['job_status'] = cleansing_status

        # Publish the layer (GeoServer/GeoNetwork) and record its UUID.
        publish_info = await publish_mapset(table_name, job_id)
        result['metadata_uuid'] = publish_info.get('uuid', '')

        # 7. Upload to the main portal (mapset integration).
        # NOTE(review): currently unused — the upload_to_main call below is
        # commented out, so this dict is built but never sent.
        mapset_payload = {
            "name": payload.title,
            "description": payload.author.get("abstract"),
            "scale": "1:25000",
            # IDs hard-coded as in the original code (consider moving to
            # config/env).
            'projection_system_id': '0196c746-d1ba-7f1c-9706-5df738679cc7',
            "category_id": payload.author.get("mapsetCategory"),
            "data_status": "sementara",
            'classification_id': '01968b4b-d3f9-76c9-888c-ee887ac31ce4',
            'producer_id': '01968b54-0000-7a67-bd10-975b8923b93e',
            "layer_type": final_geom_types[0],
            'source_id': ['019c03ef-35e1-738b-858d-871dc7d1e4d6'],
            "layer_url": publish_info.get('geos_link', ''),
            "metadata_url": f"{GEONETWORK_URL}/srv/eng/catalog.search#/metadata/{publish_info.get('uuid', '')}",
            "coverage_level": "provinsi",
            "coverage_area": "kabupaten",
            "data_update_period": "Tahunan",
            "data_version": "2026",
            "is_popular": False,
            "is_active": True,
            'regional_id': '01968b53-a910-7a67-bd10-975b8923b92e',
            "notes": "Mapset baru dibuat",
            "status_validation": "on_verification",
        }

        # await upload_to_main(mapset_payload)

        return result

    except Exception as e:
        # Error handling & logging: record the failure before surfacing it.
        await log_activity(
            user_id=user_id,
            action_type="ERROR",
            action_title="Upload gagal",
            details={"error": str(e)}
        )
        print(f"[ERROR] execute_postgis_ingestion: {e}")
        # Re-raise as an HTTPException so the router returns a clean 500.
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|