import os import shutil import pandas as pd from fastapi import UploadFile, HTTPException from typing import Optional # --- Internal Modules --- from .api.schemas import UploadRequest, PdfRequest from .core.processing.analyzer import analyze_and_clean_dataframe, publish_mapset from .core.readers import ( read_csv, read_shp, read_gdb, read_mpk, read_pdf, convert_df ) from .data.repository import ( generate_unique_table_name, insert_parquet_to_postgis, save_author_metadata, call_cleansing_procedure ) from app.mapset_pipeline.utils.file_ops import ( detect_zip_type, generate_job_id, ) from app.mapset_pipeline.utils.formatters import ( save_xml_to_sld, ) # --- Legacy/External Modules (Sesuai kode asli Anda) --- from app.core.config import UPLOAD_FOLDER, MAX_FILE_MB, GEONETWORK_URL from utils.logger_config import log_activity # from api.routers.datasets_router import ( # upload_to_main # ) async def handle_file_analysis( file: UploadFile, page: Optional[str] = "", sheet: Optional[str] = "", fileDesc: Optional[str] = "" ): """ Orchestrator untuk endpoint /upload. 1. Simpan file fisik. 2. Pilih Reader berdasarkan ekstensi. 3. Panggil Processor untuk analisis. 4. Bersihkan file fisik. """ fname = file.filename ext = os.path.splitext(fname)[1].lower() # 1. Validasi & Simpan File # Membaca file in-memory untuk cek ukuran (hati-hati memory usage untuk file besar) contents = await file.read() size_mb = len(contents) / (1024 * 1024) if size_mb > MAX_FILE_MB: raise HTTPException(status_code=413, detail="Ukuran File Terlalu Besar") tmp_path = UPLOAD_FOLDER / fname # Pastikan folder ada os.makedirs(UPLOAD_FOLDER, exist_ok=True) with open(tmp_path, "wb") as f: f.write(contents) df = None try: # 2. Routing Reader Berdasarkan Ekstensi print(f"[INFO] Processing file type: {ext}") if ext == ".csv": df = read_csv(str(tmp_path)) elif ext == ".xlsx": df = read_csv(str(tmp_path), sheet) # Asumsi read_csv handle xlsx juga sesuai kode asli elif ext == ".mpk": df = read_mpk(str(tmp_path)) elif ext == ".pdf": # Logic PDF agak unik, bisa return list tabel atau df tbl = read_pdf(tmp_path, page) if len(tbl) == 0: return { "message": "Tidak ditemukan tabel valid pada halaman yang dipilih", "tables": {}, "file_type": ext } elif len(tbl) > 1: return { "message": "File berhasil dibaca, ditemukan banyak tabel.", "tables": tbl, "file_type": ext } else: df = convert_df(tbl[0]) elif ext == ".zip": zip_type = detect_zip_type(str(tmp_path)) if zip_type == "shp": df = read_shp(str(tmp_path)) elif zip_type == "gdb": df = read_gdb(str(tmp_path)) else: raise HTTPException(status_code=400, detail="ZIP file tidak mengandung SHP / GDB valid.") else: raise HTTPException(status_code=400, detail="Unsupported file type") # Cek Dataframe Kosong if df is None or (hasattr(df, "empty") and df.empty): raise HTTPException(status_code=422, detail="File berhasil dibaca, tetapi tidak ditemukan tabel valid") # 3. Panggil Processor (Logic Cleaning & Validasi) result_analysis = await analyze_and_clean_dataframe(df, ext, fname, fileDesc) return result_analysis except Exception as e: print(f"[ERROR] handle_file_analysis: {e}") raise HTTPException(status_code=500, detail=str(e)) finally: # 4. Cleanup Uploaded File (Raw File) # Kita hapus file upload asli, tapi file temp parquet (hasil processor) # tetap hidup sampai frontend mengirim request ingest if tmp_path.exists(): try: os.remove(tmp_path) except Exception: pass async def process_pdf_file(payload: PdfRequest): """ Helper khusus jika user mengupload PDF dan ingin memilih tabel tertentu. """ try: # Convert request body ke DataFrame (sesuai logic reader_pdf) # Kita mock convert_df karena di kode asli import dari reader_pdf # yang mungkin mengharapkan format dict khusus df = convert_df(payload.model_dump()) if df is None or (hasattr(df, "empty") and df.empty): raise HTTPException(status_code=422, detail="Tidak ada tabel valid dalam PDF") # Reuse logic processor yang sama return await analyze_and_clean_dataframe( df, '.pdf', payload.fileName, payload.fileDesc ) except Exception as e: raise HTTPException(status_code=500, detail=str(e)) async def execute_postgis_ingestion(payload: UploadRequest, user_id: int): """ Orchestrator untuk endpoint /process-to-postgis. 1. Terima data (JSON rows). 2. Convert ke Parquet Temporary. 3. Upload ke PostGIS (via Repository). 4. Simpan Metadata (via Repository). 5. Trigger Cleansing & Publishing. 6. Logging. """ job_id = generate_job_id(str(user_id)) try: # 1. Generate Nama Tabel table_name = await generate_unique_table_name(payload.title) # 2. Persiapan Data (JSON -> DataFrame -> Parquet) # Kita perlu save ke parquet karena repository insert_parquet_to_postgis membaca file # Ini juga memisahkan memory load antara API dan DB Process df = pd.DataFrame(payload.rows) # Upper case columns df.columns = [col.upper() for col in df.columns] # Rename Geometry jika perlu (standarisasi input dari frontend) if "GEOMETRY" in df.columns: df.rename(columns={"GEOMETRY": "GEOM"}, inplace=True) # Simpan ke file temp untuk diproses repository temp_parquet_name = f"{job_id}.parquet" temp_parquet_path = os.path.join("tmp", temp_parquet_name) os.makedirs("tmp", exist_ok=True) # Save parquet (gunakan engine pyarrow atau fastparquet) df.to_parquet(temp_parquet_path, index=False) # 3. Insert ke PostGIS # Fungsi ini akan membaca file parquet tadi, membersihkan geom, dan copy ke DB db_result = await insert_parquet_to_postgis(temp_parquet_name, table_name) # 4. Simpan Metadata # Ambil list geom type dan row count dari hasil insert DB (lebih akurat) final_geom_types = [db_result['geom_type']] # Disederhanakan jadi list row_count = db_result['row_count'] await save_author_metadata( payload_author=payload.author, table_name=table_name, dataset_title=payload.title, geom_types=final_geom_types, row_count=row_count, user_id=user_id ) # 5. Logging Activity await log_activity( user_id=user_id, action_type="UPLOAD", action_title=f"Upload dataset {table_name}", details={"table_name": table_name, "rows": row_count} ) # 6. Post-Processing (External APIs) result = { "job_id": job_id, "job_status": "wait", "table_name": table_name, "status": "success", "message": f"Tabel '{table_name}' berhasil dibuat.", "total_rows": row_count, "geometry_type": final_geom_types, "crs": payload.author.get("crs", "EPSG:4326"), "metadata_uuid": "" } # Save Style (SLD) save_xml_to_sld(payload.style, job_id) # CLEANSING WITH QUERY try: cleansing_status = await call_cleansing_procedure(table_name) except Exception as e: cleansing_status = "failed" print(f"Cleansing warning: {e}") result['job_status'] = cleansing_status # Publish Layer (Geoserver/Geonetwork) publish_info = await publish_mapset(table_name, job_id) result['metadata_uuid'] = publish_info.get('uuid', '') # 7. Upload to Main Portal (Mapset Integration) mapset_payload = { "name": payload.title, "description": payload.author.get("abstract"), "scale": "1:25000", # ID Hardcoded sesuai kode asli (pertimbangkan pindah ke config/env) 'projection_system_id': '0196c746-d1ba-7f1c-9706-5df738679cc7', "category_id": payload.author.get("mapsetCategory"), "data_status": "sementara", 'classification_id': '01968b4b-d3f9-76c9-888c-ee887ac31ce4', 'producer_id': '01968b54-0000-7a67-bd10-975b8923b93e', "layer_type": final_geom_types[0], 'source_id': ['019c03ef-35e1-738b-858d-871dc7d1e4d6'], "layer_url": publish_info.get('geos_link', ''), "metadata_url": f"{GEONETWORK_URL}/srv/eng/catalog.search#/metadata/{publish_info.get('uuid', '')}", "coverage_level": "provinsi", "coverage_area": "kabupaten", "data_update_period": "Tahunan", "data_version": "2026", "is_popular": False, "is_active": True, 'regional_id': '01968b53-a910-7a67-bd10-975b8923b92e', "notes": "Mapset baru dibuat", "status_validation": "on_verification", } # await upload_to_main(mapset_payload) return result except Exception as e: # Error Handling & Logging await log_activity( user_id=user_id, action_type="ERROR", action_title="Upload gagal", details={"error": str(e)} ) print(f"[ERROR] execute_postgis_ingestion: {e}") # Re-raise sebagai HTTP Exception agar router mengembalikan 500 yang rapi raise HTTPException(status_code=500, detail=str(e))