export
This commit is contained in:
parent
654ad382fe
commit
6c0d8729f7
0
.env.example
Normal file → Executable file
0
.env.example
Normal file → Executable file
3
.gitignore
vendored
Normal file → Executable file
3
.gitignore
vendored
Normal file → Executable file
|
|
@ -172,3 +172,6 @@ cython_debug/
|
|||
|
||||
# PyPI configuration file
|
||||
.pypirc
|
||||
|
||||
|
||||
tests/
|
||||
0
.pre-commit-config.yaml
Normal file → Executable file
0
.pre-commit-config.yaml
Normal file → Executable file
0
.python-version
Normal file → Executable file
0
.python-version
Normal file → Executable file
0
Dockerfile
Normal file → Executable file
0
Dockerfile
Normal file → Executable file
11
addons.txt
Executable file
11
addons.txt
Executable file
|
|
@ -0,0 +1,11 @@
|
|||
pandas = "^3.0.0"
|
||||
geopandas = "^1.1.2"
|
||||
fiona = "^1.10.1"
|
||||
numpy = "^2.4.2"
|
||||
pdfplumber = "^0.11.9"
|
||||
py7zr = "^1.1.0"
|
||||
pyogrio = "^0.12.1"
|
||||
rapidfuzz = "^3.14.3"
|
||||
requests = "^2.32.5"
|
||||
openpyxl = "^3.1.5"
|
||||
pyarrow = "21.0.0"
|
||||
0
alembic.ini
Normal file → Executable file
0
alembic.ini
Normal file → Executable file
0
app/__init__.py
Normal file → Executable file
0
app/__init__.py
Normal file → Executable file
0
app/api/dependencies/__init__.py
Normal file → Executable file
0
app/api/dependencies/__init__.py
Normal file → Executable file
0
app/api/dependencies/auth.py
Normal file → Executable file
0
app/api/dependencies/auth.py
Normal file → Executable file
0
app/api/dependencies/database.py
Normal file → Executable file
0
app/api/dependencies/database.py
Normal file → Executable file
0
app/api/dependencies/factory.py
Normal file → Executable file
0
app/api/dependencies/factory.py
Normal file → Executable file
0
app/api/v1/__init__.py
Normal file → Executable file
0
app/api/v1/__init__.py
Normal file → Executable file
0
app/api/v1/routes/__init__.py
Normal file → Executable file
0
app/api/v1/routes/__init__.py
Normal file → Executable file
0
app/api/v1/routes/auth_route.py
Normal file → Executable file
0
app/api/v1/routes/auth_route.py
Normal file → Executable file
0
app/api/v1/routes/category_route.py
Normal file → Executable file
0
app/api/v1/routes/category_route.py
Normal file → Executable file
0
app/api/v1/routes/classification_route.py
Normal file → Executable file
0
app/api/v1/routes/classification_route.py
Normal file → Executable file
0
app/api/v1/routes/count_route.py
Normal file → Executable file
0
app/api/v1/routes/count_route.py
Normal file → Executable file
0
app/api/v1/routes/credential_route.py
Normal file → Executable file
0
app/api/v1/routes/credential_route.py
Normal file → Executable file
0
app/api/v1/routes/feedback_route.py
Normal file → Executable file
0
app/api/v1/routes/feedback_route.py
Normal file → Executable file
0
app/api/v1/routes/file_route.py
Normal file → Executable file
0
app/api/v1/routes/file_route.py
Normal file → Executable file
0
app/api/v1/routes/geonetwork_route.py
Normal file → Executable file
0
app/api/v1/routes/geonetwork_route.py
Normal file → Executable file
0
app/api/v1/routes/map_projection_system_route.py
Normal file → Executable file
0
app/api/v1/routes/map_projection_system_route.py
Normal file → Executable file
0
app/api/v1/routes/map_source_route.py
Normal file → Executable file
0
app/api/v1/routes/map_source_route.py
Normal file → Executable file
0
app/api/v1/routes/mapset_history_route.py
Normal file → Executable file
0
app/api/v1/routes/mapset_history_route.py
Normal file → Executable file
0
app/api/v1/routes/mapset_route.py
Normal file → Executable file
0
app/api/v1/routes/mapset_route.py
Normal file → Executable file
0
app/api/v1/routes/news_route.py
Normal file → Executable file
0
app/api/v1/routes/news_route.py
Normal file → Executable file
0
app/api/v1/routes/organization_route.py
Normal file → Executable file
0
app/api/v1/routes/organization_route.py
Normal file → Executable file
0
app/api/v1/routes/regional_route.py
Normal file → Executable file
0
app/api/v1/routes/regional_route.py
Normal file → Executable file
0
app/api/v1/routes/role_route.py
Normal file → Executable file
0
app/api/v1/routes/role_route.py
Normal file → Executable file
0
app/api/v1/routes/user_route.py
Normal file → Executable file
0
app/api/v1/routes/user_route.py
Normal file → Executable file
0
app/core/__init__.py
Normal file → Executable file
0
app/core/__init__.py
Normal file → Executable file
0
app/core/config.py
Normal file → Executable file
0
app/core/config.py
Normal file → Executable file
0
app/core/data_types.py
Normal file → Executable file
0
app/core/data_types.py
Normal file → Executable file
0
app/core/database.py
Normal file → Executable file
0
app/core/database.py
Normal file → Executable file
0
app/core/exceptions.py
Normal file → Executable file
0
app/core/exceptions.py
Normal file → Executable file
0
app/core/minio_client.py
Normal file → Executable file
0
app/core/minio_client.py
Normal file → Executable file
0
app/core/params.py
Normal file → Executable file
0
app/core/params.py
Normal file → Executable file
0
app/core/responses.py
Normal file → Executable file
0
app/core/responses.py
Normal file → Executable file
0
app/core/security.py
Normal file → Executable file
0
app/core/security.py
Normal file → Executable file
0
app/main.py
Normal file → Executable file
0
app/main.py
Normal file → Executable file
0
app/mapset_pipeline/__init__,.py
Executable file
0
app/mapset_pipeline/__init__,.py
Executable file
41
app/mapset_pipeline/api/router.py
Executable file
41
app/mapset_pipeline/api/router.py
Executable file
|
|
@ -0,0 +1,41 @@
|
|||
# services/file_pipeline/router.py
|
||||
from fastapi import APIRouter, Depends, File, UploadFile, Form
|
||||
from .schemas import UploadRequest, PdfRequest
|
||||
from app.mapset_pipeline.service import handle_file_analysis, process_pdf_file, execute_postgis_ingestion
|
||||
from app.response.res import successRes, errorRes
|
||||
|
||||
router = APIRouter(prefix="/pipeline", tags=["File Pipeline"])
|
||||
|
||||
@router.post("/analyze")
|
||||
async def upload_file(
|
||||
file: UploadFile = File(...),
|
||||
page: str = Form(""),
|
||||
sheet: str = Form(""),
|
||||
fileDesc: str = Form("")
|
||||
):
|
||||
try:
|
||||
data = await handle_file_analysis(file, page, sheet, fileDesc)
|
||||
return successRes(data=data)
|
||||
except Exception as e:
|
||||
return errorRes(message="Upload failed", details=str(e), status_code=500)
|
||||
|
||||
|
||||
@router.post("/analyze/pdf")
|
||||
async def upload_file(
|
||||
payload: PdfRequest
|
||||
):
|
||||
try:
|
||||
res = await process_pdf_file(payload)
|
||||
return res
|
||||
except Exception as e:
|
||||
return errorRes(message="Upload failed", details=str(e), status_code=500)
|
||||
|
||||
|
||||
@router.post("/publish")
|
||||
async def process_to_postgis(payload: UploadRequest):
|
||||
# user_id bisa diambil dari dependency injection auth
|
||||
try:
|
||||
data = await execute_postgis_ingestion(payload, user_id=2)
|
||||
return successRes(data=data)
|
||||
except Exception as e:
|
||||
return errorRes(message="Processing failed", details=str(e), status_code=500)
|
||||
17
app/mapset_pipeline/api/schemas.py
Executable file
17
app/mapset_pipeline/api/schemas.py
Executable file
|
|
@ -0,0 +1,17 @@
|
|||
from pydantic import BaseModel
|
||||
from typing import List, Dict, Any
|
||||
|
||||
class PdfRequest(BaseModel):
|
||||
title: str
|
||||
columns: List[str]
|
||||
rows: List[List]
|
||||
fileName: str
|
||||
fileDesc: str
|
||||
|
||||
class UploadRequest(BaseModel):
|
||||
title: str
|
||||
path: str
|
||||
rows: List[dict]
|
||||
columns: List[str]
|
||||
author: Dict[str, Any]
|
||||
style: str
|
||||
49
app/mapset_pipeline/core/clients/ai_client.py
Executable file
49
app/mapset_pipeline/core/clients/ai_client.py
Executable file
|
|
@ -0,0 +1,49 @@
|
|||
import requests
|
||||
from typing import Dict, Any
|
||||
from app.core.config import GEN_AI_URL
|
||||
|
||||
URL = GEN_AI_URL
|
||||
|
||||
|
||||
def generate_metadata(payload: Dict[str, Any]) -> Dict[str, Any]:
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
"API_KEY": "testsatupeta"
|
||||
}
|
||||
|
||||
try:
|
||||
response = requests.post(
|
||||
f"{URL}",
|
||||
json=payload,
|
||||
headers=headers,
|
||||
)
|
||||
|
||||
# response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
return {
|
||||
"success": False,
|
||||
"error": str(e)
|
||||
}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Contoh payload
|
||||
payload = {
|
||||
"nama_file_peta": "peta bencana.pdf",
|
||||
"nama_opd": "Badan Penanggulangan Bencana Daerah (BPBD)",
|
||||
"tipe_data_spasial": "Multipolygon",
|
||||
"struktur_atribut_data": {},
|
||||
"metadata": {
|
||||
"judul": "",
|
||||
"abstrak": "",
|
||||
"tujuan": "",
|
||||
"keyword": [],
|
||||
"kategori": [],
|
||||
"kategori_mapset": ""
|
||||
}
|
||||
}
|
||||
|
||||
result = generate_metadata(payload)
|
||||
print(result)
|
||||
181
app/mapset_pipeline/core/processing/analyzer.py
Executable file
181
app/mapset_pipeline/core/processing/analyzer.py
Executable file
|
|
@ -0,0 +1,181 @@
|
|||
import os
|
||||
import asyncio
|
||||
import pandas as pd
|
||||
import geopandas as gpd
|
||||
from app.response.res import errorRes
|
||||
|
||||
from app.mapset_pipeline.utils.file_ops import generate_unique_filename, dataframe_validation
|
||||
from app.mapset_pipeline.utils.formatters import safe_json
|
||||
from .geometry_build import is_geom_empty, detect_and_build_geometry, attach_polygon_geometry_auto
|
||||
from app.mapset_pipeline.core.clients.ai_client import generate_metadata
|
||||
from app.mapset_pipeline.core.publication.publish_geoserver import publish_layer_to_geoserver
|
||||
from app.mapset_pipeline.core.publication.publish_geonetwork import publish_metadata
|
||||
|
||||
async def analyze_and_clean_dataframe(df: pd.DataFrame, ext: str, filename: str, fileDesc: str):
|
||||
"""
|
||||
Fungsi utama untuk memproses DataFrame:
|
||||
1. Deteksi Geometri
|
||||
2. Validasi & Hitung Statistik
|
||||
3. Generate Preview & Warnings
|
||||
4. Generate Metadata (AI)
|
||||
5. Simpan ke Temporary Parquet
|
||||
"""
|
||||
|
||||
# 1. Deteksi Geometri
|
||||
result = detect_and_build_geometry(df, master_polygons=None)
|
||||
|
||||
if not hasattr(result, "geometry") or result.geometry.isna().all():
|
||||
result = attach_polygon_geometry_auto(result)
|
||||
|
||||
def normalize_geom_type(geom_type):
|
||||
if geom_type and geom_type.startswith("Multi"):
|
||||
return geom_type.replace("Multi", "")
|
||||
return geom_type
|
||||
|
||||
# 2. Analisis Tipe Geometri
|
||||
if isinstance(result, gpd.GeoDataFrame) and "geometry" in result.columns:
|
||||
geom_types = (
|
||||
result.geometry
|
||||
.dropna()
|
||||
.geom_type
|
||||
.apply(normalize_geom_type)
|
||||
.unique()
|
||||
)
|
||||
geom_type = geom_types[0] if len(geom_types) > 0 else "None"
|
||||
null_geom = result.geometry.isna().sum()
|
||||
|
||||
print(f"[INFO] Tipe Geometry: {geom_type}")
|
||||
print(f"[INFO] Jumlah geometry kosong: {null_geom}")
|
||||
else:
|
||||
# Fallback jika gagal mendeteksi geometry
|
||||
res = {
|
||||
"message": "Tidak menemukan tabel yang relevan atau kolom geometri.",
|
||||
"file_type": ext,
|
||||
"rows": len(df),
|
||||
"columns": len(df.columns),
|
||||
"geometry_valid": 0,
|
||||
"geometry_empty": 0,
|
||||
"geometry_valid_percent": 0,
|
||||
"warnings": [],
|
||||
"warning_examples": [],
|
||||
"preview": []
|
||||
}
|
||||
# Kita raise error dictionary agar bisa ditangkap oleh router/service
|
||||
# Atau return dictionary error structure
|
||||
return errorRes(message="Tidak berhasil mencocokan geometry pada tabel.", details=res, status_code=422)
|
||||
|
||||
# 3. Cleaning Data Values
|
||||
result = result.replace([pd.NA, float('inf'), float('-inf')], None)
|
||||
|
||||
# Convert Geometry ke WKT untuk analisis teks
|
||||
if isinstance(result, gpd.GeoDataFrame) and 'geometry' in result.columns:
|
||||
# Kita perlu simpan WKT string agar serializable saat preview
|
||||
# Tapi biarkan geometry asli untuk proses parquet nanti
|
||||
pass
|
||||
|
||||
# Hitung Statistik Validitas
|
||||
empty_count = result['geometry'].apply(is_geom_empty).sum()
|
||||
valid_count = len(result) - empty_count
|
||||
match_percentage = (valid_count / len(result)) * 100
|
||||
|
||||
warnings = []
|
||||
if empty_count > 0:
|
||||
warnings.append(
|
||||
f"{empty_count} dari {len(result)} baris tidak memiliki geometry yang valid "
|
||||
f"({100 - match_percentage:.2f}% data gagal cocok)."
|
||||
)
|
||||
|
||||
# Ambil contoh data error
|
||||
if empty_count > 0:
|
||||
examples = result[result['geometry'].apply(is_geom_empty)].head(500)
|
||||
warning_examples = examples.to_dict(orient="records")
|
||||
else:
|
||||
warning_examples = []
|
||||
|
||||
# Prepare Preview Data (Convert WKT for JSON response)
|
||||
# Kita copy agar tidak merusak dataframe utama
|
||||
data_df = result.copy()
|
||||
if 'geometry' in data_df.columns:
|
||||
data_df['geometry'] = data_df['geometry'].apply(
|
||||
lambda g: g.wkt if g is not None else None
|
||||
)
|
||||
|
||||
preview_data = data_df.to_dict(orient="records")
|
||||
|
||||
# Sanitasi JSON (numpy types -> python types)
|
||||
preview_safe = [
|
||||
{k: safe_json(v) for k, v in row.items()} for row in preview_data
|
||||
]
|
||||
|
||||
warning_safe = [
|
||||
{k: safe_json(v) for k, v in row.items()} for row in warning_examples
|
||||
]
|
||||
|
||||
# 4. AI Metadata Generation
|
||||
ai_context = {
|
||||
"nama_file_peta": filename,
|
||||
"nama_opd": "Badan Penanggulangan Bencana Daerah (BPBD) Provinsi Jatim", # Sebaiknya dinamis
|
||||
"tipe_data_spasial": geom_type,
|
||||
"deskripsi_singkat": fileDesc,
|
||||
"struktur_atribut_data": {},
|
||||
}
|
||||
|
||||
try:
|
||||
ai_suggest = generate_metadata(ai_context)
|
||||
except Exception as e:
|
||||
print(f"[WARNING] Gagal generate metadata AI: {e}")
|
||||
ai_suggest = {}
|
||||
|
||||
# 5. Simpan ke Temporary Parquet
|
||||
# Gunakan filename unik agar thread safe
|
||||
tmp_file = generate_unique_filename(folder="tmp", ext="parquet")
|
||||
|
||||
# Proses konversi synchronous dijalankan di thread terpisah agar tidak blocking
|
||||
print('start')
|
||||
await asyncio.to_thread(dataframe_validation, data_df, tmp_file)
|
||||
print('pass')
|
||||
|
||||
response = {
|
||||
"message": "File berhasil dibaca dan dianalisis.",
|
||||
"file_name": filename,
|
||||
"file_type": ext,
|
||||
"rows": int(len(result)),
|
||||
"columns": list(map(str, result.columns)),
|
||||
"geometry_valid": int(valid_count),
|
||||
"geometry_empty": int(empty_count),
|
||||
"geometry_valid_percent": float(round(match_percentage, 2)),
|
||||
"geometry_type": geom_type,
|
||||
"warnings": warnings,
|
||||
"warning_rows": warning_safe,
|
||||
"preview": preview_safe,
|
||||
"metadata_suggest": ai_suggest,
|
||||
"tmp_path": tmp_file
|
||||
}
|
||||
|
||||
return response
|
||||
|
||||
|
||||
async def publish_mapset(table_name: str, job_id: str):
|
||||
try:
|
||||
|
||||
geos_link = publish_layer_to_geoserver(table_name, job_id)
|
||||
|
||||
uuid = await publish_metadata(
|
||||
table_name=table_name,
|
||||
geoserver_links=geos_link
|
||||
)
|
||||
|
||||
# await update_job_status(table_name, "FINISHED", job_id)
|
||||
|
||||
# return uuid
|
||||
return {
|
||||
"geos_link": geos_link["layer_url"],
|
||||
# "uuid": uuid
|
||||
"uuid": "123123"
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
# await update_job_status(table_name, "FAILED", job_id)
|
||||
raise RuntimeError(f"Publish layer gagal: {e}") from e
|
||||
|
||||
|
||||
466
app/mapset_pipeline/core/processing/geometry_build.py
Executable file
466
app/mapset_pipeline/core/processing/geometry_build.py
Executable file
|
|
@ -0,0 +1,466 @@
|
|||
import geopandas as gpd
|
||||
from shapely.geometry import Point, LineString
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import re
|
||||
import os
|
||||
from shapely import wkt
|
||||
from rapidfuzz import process, fuzz
|
||||
from sqlalchemy import create_engine
|
||||
from shapely.geometry.base import BaseGeometry
|
||||
from app.core.config import REFERENCE_DB_URL, REFERENCE_SCHEMA, DESA_REF, KEC_REF, KAB_REF
|
||||
|
||||
# ============================================================
|
||||
# KONFIGURASI DAN KONSTANTA
|
||||
# ============================================================
|
||||
|
||||
COLUMN_ALIASES = {
|
||||
'desa': ['desa', 'kelurahan', 'desa_kelurahan', 'desa/kelurahan', 'nama_desa', 'nama_kelurahan', 'Desa/Kel'],
|
||||
'kecamatan': ['kec', 'kecamatan', 'nama_kec', 'nama_kecamatan'],
|
||||
'kabupaten': ['kab', 'kabupaten', 'kota', 'kabupaten_kota', 'kota_kabupaten', 'kab/kota', 'kota/kabupaten', 'kota/kab']
|
||||
}
|
||||
|
||||
# ============================================================
|
||||
# FUNGSI BANTU ADMINISTRATIF
|
||||
# ============================================================
|
||||
|
||||
def find_admin_column(df, aliases):
|
||||
"""Mencari kolom yang paling cocok untuk tiap level admin (desa/kec/kab)"""
|
||||
matched = {}
|
||||
for level, alias_list in aliases.items():
|
||||
for col in df.columns:
|
||||
col_norm = col.strip().lower().replace(' ', '_').replace('/', '_')
|
||||
if any(alias in col_norm for alias in alias_list):
|
||||
matched[level] = col
|
||||
break
|
||||
return matched
|
||||
|
||||
|
||||
def detect_smallest_admin_level(df):
|
||||
"""Mendeteksi level administratif terkecil yang ada di DataFrame"""
|
||||
cols = [c.lower() for c in df.columns]
|
||||
if any('desa' in c or 'kelurahan' in c for c in cols):
|
||||
return 'desa'
|
||||
elif any('kecamatan' in c for c in cols):
|
||||
return 'kecamatan'
|
||||
elif any('kab' in c or 'kota' in c for c in cols):
|
||||
return 'kabupaten'
|
||||
return None
|
||||
|
||||
|
||||
def fuzzy_merge(df, master, left_key, right_key, threshold=85):
|
||||
"""Melakukan fuzzy matching antar nama wilayah"""
|
||||
matches = df[left_key].apply(
|
||||
lambda x: process.extractOne(str(x), master[right_key], score_cutoff=threshold)
|
||||
)
|
||||
df['match'] = matches.apply(lambda m: m[0] if m else None)
|
||||
merged = df.merge(master, left_on='match', right_on=right_key, how='left')
|
||||
return merged
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def normalize_name(name: str, level: str = None):
|
||||
if not isinstance(name, str):
|
||||
return None
|
||||
|
||||
name = name.strip()
|
||||
if not name:
|
||||
return None
|
||||
|
||||
name = re.sub(r'\s*\([^)]*\)\s*', '', name)
|
||||
|
||||
raw = name.lower()
|
||||
raw = re.sub(r'^(desa|kelurahan|kel|dusun|kampung)\s+', '', raw)
|
||||
raw = re.sub(r'^(kecamatan|kec)\s+', '', raw)
|
||||
raw = re.sub(r'^(kabupaten|kab\.?|kab)\s+', '', raw)
|
||||
|
||||
if level in ["kabupaten", "kota"]:
|
||||
raw = re.sub(r'^(kota\s+)', '', raw)
|
||||
|
||||
raw = re.sub(r'[^a-z\s]', '', raw)
|
||||
raw = re.sub(r'\s+', ' ', raw).strip()
|
||||
|
||||
tokens = raw.split()
|
||||
|
||||
merged_tokens = []
|
||||
i = 0
|
||||
while i < len(tokens):
|
||||
if i < len(tokens) - 1:
|
||||
sim = fuzz.ratio(tokens[i], tokens[i + 1])
|
||||
if sim > 75:
|
||||
merged_tokens.append(tokens[i] + tokens[i + 1])
|
||||
i += 2
|
||||
continue
|
||||
merged_tokens.append(tokens[i])
|
||||
i += 1
|
||||
|
||||
cleaned_tokens = []
|
||||
prev = None
|
||||
for tok in merged_tokens:
|
||||
if prev and fuzz.ratio(prev, tok) > 95:
|
||||
continue
|
||||
cleaned_tokens.append(tok)
|
||||
prev = tok
|
||||
|
||||
raw = " ".join(cleaned_tokens)
|
||||
formatted = raw.title()
|
||||
|
||||
if level in ["kabupaten", "kota"]:
|
||||
if "kota" in name.lower():
|
||||
if not formatted.startswith("Kota "):
|
||||
formatted = f"Kota {formatted}"
|
||||
else:
|
||||
formatted = formatted.replace("Kota ", "")
|
||||
|
||||
return formatted
|
||||
|
||||
|
||||
|
||||
|
||||
def is_geom_empty(g):
|
||||
"""True jika geometry None, NaN, atau geometry Shapely kosong."""
|
||||
if g is None:
|
||||
return True
|
||||
if isinstance(g, float) and pd.isna(g):
|
||||
return True
|
||||
if isinstance(g, BaseGeometry):
|
||||
return g.is_empty
|
||||
return False
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
import math
|
||||
|
||||
def normalize_lon(val, is_lat=False):
|
||||
if pd.isna(val):
|
||||
return None
|
||||
try:
|
||||
v = float(val)
|
||||
except:
|
||||
return None
|
||||
|
||||
av = abs(v)
|
||||
if av == 0:
|
||||
return v
|
||||
|
||||
if (-180 <= v <= 180 and not is_lat) or (-90 <= v <= 90 and is_lat):
|
||||
return v
|
||||
|
||||
for factor in [1, 10, 100, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9]:
|
||||
nv = v / factor
|
||||
if (not is_lat and -180 <= nv <= 180) or (is_lat and -90 <= nv <= 90):
|
||||
return nv
|
||||
|
||||
return None
|
||||
|
||||
|
||||
|
||||
def normalize_lat(val):
|
||||
if pd.isna(val):
|
||||
return None
|
||||
v = float(val)
|
||||
av = abs(v)
|
||||
if av > 1e9: # contoh: -8167413802 (10 digit)
|
||||
return v / 1e9
|
||||
elif av > 1e8: # fallback jika ada variasi
|
||||
return v / 1e8
|
||||
else:
|
||||
return v
|
||||
|
||||
|
||||
# ============================================================
|
||||
# FUNGSI UTAMA GEOMETRY DETECTION (LAT/LON / PATH)
|
||||
# ============================================================
|
||||
def detect_and_build_geometry(df: pd.DataFrame, master_polygons: gpd.GeoDataFrame = None):
|
||||
"""
|
||||
Mendeteksi dan membentuk geometry dari DataFrame.
|
||||
Bisa dari lat/lon, WKT, atau join ke master polygon (jika disediakan).
|
||||
"""
|
||||
|
||||
if isinstance(df, gpd.GeoDataFrame):
|
||||
geom_cols = [
|
||||
c for c in df.columns
|
||||
if re.match(r'^(geometry|geom|the_geom|wkb_geometry)$', c, re.IGNORECASE)
|
||||
or c.lower().startswith("geom")
|
||||
or c.lower().endswith("geometry")
|
||||
]
|
||||
# if "geometry" in df.columns and df.geometry.notna().any():
|
||||
if geom_cols:
|
||||
geom_count = df.geometry.notna().sum()
|
||||
geom_type = list(df.geom_type.unique())
|
||||
print(f"[INFO] Detected existing geometry in GeoDataFrame ({geom_count} features, {geom_type}).")
|
||||
return df
|
||||
|
||||
lat_col = next((c for c in df.columns if re.search(r'\b(lat|latitude|y[_\s]*coord|y$)\b', c.lower())), None)
|
||||
lon_col = next((c for c in df.columns if re.search(r'\b(lon|long|longitude|x[_\s]*coord|x$)\b', c.lower())), None)
|
||||
|
||||
if lat_col and lon_col:
|
||||
df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce')
|
||||
df[lon_col] = pd.to_numeric(df[lon_col], errors='coerce')
|
||||
|
||||
df[lon_col] = df[lon_col].apply(lambda x: normalize_lon(x, is_lat=False))
|
||||
df[lat_col] = df[lat_col].apply(normalize_lat)
|
||||
|
||||
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df[lon_col], df[lat_col]), crs="EPSG:4326")
|
||||
print("[INFO] Geometry dibangun dari kolom lat/lon.")
|
||||
return gdf
|
||||
|
||||
coord_col = next(
|
||||
(c for c in df.columns if re.search(r'(geom|geometry|wkt|shp|shape|path|coord)', c.lower())), None
|
||||
)
|
||||
|
||||
if coord_col and df[coord_col].notnull().any():
|
||||
sample_val = str(df[coord_col].dropna().iloc[0]).strip()
|
||||
|
||||
if sample_val.startswith('['):
|
||||
def parse_geom(val):
|
||||
try:
|
||||
pts = eval(val)
|
||||
return LineString(pts)
|
||||
except Exception:
|
||||
return None
|
||||
df['geometry'] = df[coord_col].apply(parse_geom)
|
||||
gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")
|
||||
print("[INFO] Geometry dibangun dari kolom koordinat/path (list of points).")
|
||||
return gdf
|
||||
|
||||
elif any(x in sample_val.upper() for x in ["POINT", "LINESTRING", "POLYGON"]):
|
||||
try:
|
||||
df['geometry'] = df[coord_col].apply(
|
||||
lambda g: wkt.loads(g) if isinstance(g, str) and any(
|
||||
x in g.upper() for x in ["POINT", "LINESTRING", "POLYGON"]
|
||||
) else None
|
||||
)
|
||||
gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")
|
||||
print("[INFO] Geometry dibangun dari kolom WKT (Point/Line/Polygon/MultiPolygon).")
|
||||
return gdf
|
||||
except Exception as e:
|
||||
print(f"[WARN] Gagal parsing kolom geometry sebagai WKT: {e}")
|
||||
|
||||
|
||||
|
||||
if master_polygons is not None:
|
||||
df.columns = df.columns.str.lower().str.strip().str.replace(' ', '_').str.replace('/', '_')
|
||||
matches = find_admin_column(df, COLUMN_ALIASES)
|
||||
|
||||
if 'desa' in matches:
|
||||
admin_col = matches['desa']
|
||||
merged = df.merge(master_polygons, left_on=admin_col, right_on='nama_desa', how='left')
|
||||
if merged['geometry'].isna().sum() > 0:
|
||||
merged = fuzzy_merge(df, master_polygons, admin_col, 'nama_desa')
|
||||
gdf = gpd.GeoDataFrame(merged, geometry='geometry', crs=master_polygons.crs)
|
||||
return gdf
|
||||
|
||||
elif 'kecamatan' in matches:
|
||||
admin_col = matches['kecamatan']
|
||||
merged = df.merge(master_polygons, left_on=admin_col, right_on='nama_kecamatan', how='left')
|
||||
gdf = gpd.GeoDataFrame(merged, geometry='geometry', crs=master_polygons.crs)
|
||||
return gdf
|
||||
|
||||
elif 'kabupaten' in matches:
|
||||
admin_col = matches['kabupaten']
|
||||
merged = df.merge(master_polygons, left_on=admin_col, right_on='nama_kabupaten', how='left')
|
||||
gdf = gpd.GeoDataFrame(merged, geometry='geometry', crs=master_polygons.crs)
|
||||
return gdf
|
||||
|
||||
print("[WARN] Tidak ditemukan geometry (lat/lon, path, atau master).")
|
||||
return df
|
||||
|
||||
|
||||
# def get_reference_polygons(level):
|
||||
# """Mengambil data batas wilayah (MultiPolygon) dari DB referensi"""
|
||||
# table_map = {
|
||||
# 'desa': f"{REFERENCE_SCHEMA}.administrasi_ar_keldesa_jatim",
|
||||
# 'kecamatan': f"{REFERENCE_SCHEMA}.administrasi_ar_kec_jatim",
|
||||
# 'kabupaten': f"{REFERENCE_SCHEMA}.administrasi_ar_kabkot_jatim"
|
||||
# }
|
||||
|
||||
# table_name = table_map.get(level)
|
||||
# if not table_name:
|
||||
# raise ValueError(f"Tidak ada tabel referensi untuk level '{level}'.")
|
||||
|
||||
# engine = create_engine(REFERENCE_DB_URL)
|
||||
# query = f"SELECT *, ST_Multi(geom) AS geometry FROM {table_name}"
|
||||
# gdf = gpd.read_postgis(query, engine, geom_col='geometry')
|
||||
|
||||
# print(f"[INFO] {len(gdf)} data referensi '{level}' berhasil dimuat dari {table_name}.")
|
||||
# return gdf
|
||||
|
||||
|
||||
from functools import lru_cache
|
||||
|
||||
@lru_cache(maxsize=3)
|
||||
def get_reference_polygons(level):
|
||||
local_path = f"cache/{level}_ref.parquet"
|
||||
if os.path.exists(local_path):
|
||||
print(f"[CACHE] Memuat referensi '{level}' dari file lokal.")
|
||||
return gpd.read_parquet(local_path)
|
||||
|
||||
print(f"[DB] Mengambil data referensi '{level}' dari database...")
|
||||
table_map = {
|
||||
"desa": f"{REFERENCE_SCHEMA}.administrasi_ar_keldesa_jatim",
|
||||
"kecamatan": f"{REFERENCE_SCHEMA}.administrasi_ar_kec_jatim",
|
||||
"kabupaten": f"{REFERENCE_SCHEMA}.administrasi_ar_kabkot_jatim"
|
||||
}
|
||||
table_name = table_map.get(level)
|
||||
engine = create_engine(REFERENCE_DB_URL)
|
||||
query = f"SELECT *, ST_Multi(geom) AS geometry FROM {table_name}"
|
||||
gdf = gpd.read_postgis(query, engine, geom_col="geometry")
|
||||
gdf.to_parquet(local_path)
|
||||
print(f"[CACHE] Disimpan ke {local_path}")
|
||||
return gdf
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Optimize Join
|
||||
# ============================================================
|
||||
def build_join_key(df, cols):
|
||||
arr = df[cols].astype(str).replace("nan", "", regex=False).to_numpy()
|
||||
return np.char.add.reduce(np.column_stack(
|
||||
[arr[:, i] + ("|" if i < len(cols) - 1 else "") for i in range(len(cols))]
|
||||
), axis=1)
|
||||
|
||||
|
||||
# ============================================================
|
||||
# FUNGSI: AUTO ATTACH POLYGON KE DATAFRAME NON-SPASIAL
|
||||
# ============================================================
|
||||
def attach_polygon_geometry_auto(df: pd.DataFrame):
|
||||
"""
|
||||
Tambahkan kolom geometry MultiPolygon berdasarkan kombinasi
|
||||
(desa/kelurahan + kecamatan + kabupaten/kota), tanpa duplikasi baris.
|
||||
"""
|
||||
level = detect_smallest_admin_level(df)
|
||||
if not level:
|
||||
print("[WARN] Tidak ditemukan kolom administratif (desa/kecamatan/kabupaten).")
|
||||
return df
|
||||
|
||||
print(f"[INFO] Detected smallest admin level: {level}")
|
||||
ref_gdf = get_reference_polygons(level)
|
||||
|
||||
desa_col = next((c for c in df.columns if any(x in c.lower() for x in ['desa', 'kelurahan'])), None)
|
||||
kec_col = next((c for c in df.columns if 'kec' in c.lower()), None)
|
||||
kab_col = next((c for c in df.columns if any(x in c.lower() for x in ['kab', 'kota'])), None)
|
||||
|
||||
if desa_col and (not kec_col or not kab_col):
|
||||
print("[ERROR] Kolom 'Desa' ditemukan tetapi kolom 'Kecamatan' dan/atau 'Kabupaten' tidak lengkap.")
|
||||
print(f"[DEBUG] Ditemukan: Desa={desa_col}, Kec={kec_col}, Kab={kab_col}")
|
||||
return df
|
||||
|
||||
elif not desa_col and kec_col and not kab_col:
|
||||
print("[ERROR] Kolom 'Kecamatan' ditemukan tetapi kolom 'Kabupaten/Kota' tidak ditemukan.")
|
||||
print(f"[DEBUG] Ditemukan: Desa={desa_col}, Kec={kec_col}, Kab={kab_col}")
|
||||
return df
|
||||
|
||||
elif kab_col and not desa_col and not kec_col :
|
||||
print("[INFO] Struktur kolom administratif valid (minimal Kabupaten/Kota ditemukan).")
|
||||
print(f"[DEBUG] Ditemukan: Desa={desa_col}, Kec={kec_col}, Kab={kab_col}")
|
||||
|
||||
elif not desa_col and not kec_col and not kab_col:
|
||||
print("[WARN] Tidak ditemukan kolom administratif apapun (Desa/Kecamatan/Kabupaten).")
|
||||
print(f"[DEBUG] Kolom CSV: {list(df.columns)}")
|
||||
return df
|
||||
|
||||
# kolom di referensi
|
||||
desa_ref = DESA_REF
|
||||
kec_ref = KEC_REF
|
||||
kab_ref = KAB_REF
|
||||
|
||||
if desa_col is not None:
|
||||
df[desa_col] = df[desa_col].astype(str).apply(lambda x: normalize_name(x, "desa"))
|
||||
|
||||
if kec_col is not None:
|
||||
df[kec_col] = df[kec_col].astype(str).apply(lambda x: normalize_name(x, "kecamatan"))
|
||||
|
||||
if kab_col is not None:
|
||||
df[kab_col] = df[kab_col].astype(str).apply(lambda x: normalize_name(x, "kabupaten"))
|
||||
|
||||
|
||||
if desa_ref is not None:
|
||||
ref_gdf[desa_ref] = ref_gdf[desa_ref].astype(str).apply(lambda x: normalize_name(x, "desa"))
|
||||
|
||||
if kec_ref is not None:
|
||||
ref_gdf[kec_ref] = ref_gdf[kec_ref].astype(str).apply(lambda x: normalize_name(x, "kecamatan"))
|
||||
|
||||
if kab_ref is not None:
|
||||
ref_gdf[kab_ref] = ref_gdf[kab_ref].astype(str).apply(lambda x: normalize_name(x, "kabupaten"))
|
||||
|
||||
|
||||
|
||||
|
||||
join_cols = [col for col in [desa_col, kec_col, kab_col] if col]
|
||||
|
||||
if not join_cols:
|
||||
print("[ERROR] Tidak ada kolom administratif yang bisa digunakan untuk join key.")
|
||||
else:
|
||||
join_cols_df = [col for col in [desa_col, kec_col, kab_col] if col]
|
||||
join_cols_ref = [col for col in [desa_ref, kec_ref, kab_ref] if col]
|
||||
|
||||
common_depth = min(len(join_cols_df), len(join_cols_ref))
|
||||
join_cols_df = join_cols_df[-common_depth:]
|
||||
join_cols_ref = join_cols_ref[-common_depth:]
|
||||
|
||||
# print(f"[DEBUG] Join kolom DF : {join_cols_df}")
|
||||
# print(f"[DEBUG] Join kolom REF : {join_cols_ref}")
|
||||
|
||||
# df["_join_key"] = df[join_cols_df].astype(str).agg("|".join, axis=1)
|
||||
# ref_gdf["_join_key"] = ref_gdf[join_cols_ref].astype(str).agg("|".join, axis=1)
|
||||
|
||||
df["_join_key"] = build_join_key(df, join_cols_df)
|
||||
ref_gdf["_join_key"] = build_join_key(ref_gdf, join_cols_ref)
|
||||
|
||||
|
||||
# print(f"[INFO] Join key berhasil dibuat dari kolom: {join_cols_df}")
|
||||
|
||||
ref_lookup = ref_gdf[["_join_key", "geometry"]].drop_duplicates(subset=["_join_key"])
|
||||
df = df.merge(ref_lookup, how="left", on="_join_key")
|
||||
matched = df["geometry"].notna().sum()
|
||||
# print(f"[INFO] {matched} dari {len(df)} baris cocok langsung berdasarkan (desa + kec + kab/kota).")
|
||||
|
||||
if matched < len(df):
|
||||
unmatched = df[df["geometry"].isna()]
|
||||
# print(f"[INFO] Melakukan fuzzy match untuk {len(unmatched)} baris yang belum cocok...")
|
||||
|
||||
ref_dict = dict(zip(ref_lookup["_join_key"], ref_lookup["geometry"]))
|
||||
|
||||
def find_fuzzy_geom(row):
|
||||
key = row["_join_key"]
|
||||
if not isinstance(key, str):
|
||||
return None
|
||||
# fuzzy old
|
||||
# match = process.extractOne(key, list(ref_dict.keys()), scorer=fuzz.token_sort_ratio)
|
||||
# fuzzy new
|
||||
match = process.extractOne(
|
||||
key, list(ref_dict.keys()), scorer=fuzz.token_set_ratio, score_cutoff=80
|
||||
)
|
||||
|
||||
if match and match[1] >= 85:
|
||||
return ref_dict[match[0]]
|
||||
return None
|
||||
|
||||
df.loc[df["geometry"].isna(), "geometry"] = df[df["geometry"].isna()].apply(find_fuzzy_geom, axis=1)
|
||||
|
||||
df = df.drop(columns=["_join_key"], errors="ignore")
|
||||
|
||||
# admin_cols = [col for col in [desa_col, kec_col, kab_col] if col and col in df.columns]
|
||||
# if matched < len(df):
|
||||
# diff = df[df['geometry'].isna()][admin_cols]
|
||||
|
||||
# print("[DEBUG] Baris yang tidak match:")
|
||||
# if diff.empty:
|
||||
# print("(semua baris berhasil match)")
|
||||
# else:
|
||||
# print(diff.to_string(index=False))
|
||||
|
||||
|
||||
# print(f"[REPORT] Total match: {df['geometry'].notna().sum()} / {len(df)} ({df['geometry'].notna().mean()*100:.2f}%)")
|
||||
|
||||
|
||||
return gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")
|
||||
693
app/mapset_pipeline/core/publication/publish_geonetwork.py
Executable file
693
app/mapset_pipeline/core/publication/publish_geonetwork.py
Executable file
|
|
@ -0,0 +1,693 @@
|
|||
from fastapi import HTTPException
|
||||
import requests
|
||||
from sqlalchemy import text
|
||||
from app.core.config import GEONETWORK_PASS, GEONETWORK_URL, GEONETWORK_USER
|
||||
from database.connection import engine
|
||||
from datetime import datetime
|
||||
from uuid import uuid4
|
||||
import re
|
||||
|
||||
|
||||
|
||||
def create_gn_session():
    """Open an authenticated GeoNetwork session and fetch its XSRF token.

    Returns:
        tuple: ``(session, xsrf_token)`` where ``session`` is a
        :class:`requests.Session` carrying HTTP basic auth and the cookies
        GeoNetwork issued.

    Raises:
        Exception: if GeoNetwork did not set an ``XSRF-TOKEN`` cookie.
    """
    session = requests.Session()
    session.auth = (GEONETWORK_USER, GEONETWORK_PASS)

    # Any authenticated GET makes GeoNetwork issue the XSRF-TOKEN cookie.
    # A timeout is set so a dead server cannot hang the pipeline forever
    # (the original call had no timeout).
    session.get(f"{GEONETWORK_URL}/srv/eng/info?type=me", timeout=30)
    xsrf_token = session.cookies.get("XSRF-TOKEN")

    if not xsrf_token:
        raise Exception("XSRF token missing")

    return session, xsrf_token
|
||||
|
||||
|
||||
|
||||
def escape_url_params(url: str) -> str:
    """Escape ampersands in *url* so the URL is valid inside XML.

    Every ``&`` that is not already part of an ``&amp;`` entity is replaced
    by ``&amp;`` (the negative lookahead keeps existing entities intact).
    """
    return re.sub(r"&(?!amp;)", "&amp;", url)
|
||||
|
||||
|
||||
def fix_xml_urls(xml: str) -> str:
    """Escape the URL inside every ``<gmd:URL>...</gmd:URL>`` element.

    Each matched URL is stripped of surrounding whitespace and its raw
    ampersands are converted to ``&amp;`` so the document stays valid XML.
    """
    def _escape(match):
        # Same escaping as escape_url_params, applied to the trimmed URL.
        url = re.sub(r"&(?!amp;)", "&amp;", match.group(1).strip())
        return f"<gmd:URL>{url}</gmd:URL>"

    # DOTALL lets a URL span line breaks inside the element.
    return re.sub(r"<gmd:URL>(.*?)</gmd:URL>", _escape, xml, flags=re.DOTALL)
|
||||
|
||||
|
||||
|
||||
async def get_extent(table_name: str):
    """Return a WGS84 bounding box for the geometries of ``public."table_name"``.

    Returns ``None`` when the table has no geometries (or does not produce an
    extent); otherwise a dict with ``xmin``/``ymin``/``xmax``/``ymax``.

    NOTE(review): the computed extent is currently discarded and a hard-coded
    East-Java bounding box is returned instead (the real values are in the
    commented-out return below) — confirm this override is still wanted.
    """
    # table_name is interpolated into the SQL as an identifier; identifiers
    # cannot be bound as parameters, so callers must pass trusted names only.
    sql = f"""
        SELECT
            ST_XMin(extent), ST_YMin(extent),
            ST_XMax(extent), ST_YMax(extent)
        FROM (
            SELECT ST_Extent(geom) AS extent
            FROM public."{table_name}"
        ) AS box;
    """

    async with engine.connect() as conn:
        # SQLAlchemy 2.x refuses plain strings: textual SQL must be wrapped
        # in text() (the original passed the raw string and would raise).
        result = await conn.execute(text(sql))
        row = result.fetchone()

    if not row or row[0] is None:
        return None

    # return {
    #     "xmin": row[0],
    #     "ymin": row[1],
    #     "xmax": row[2],
    #     "ymax": row[3]
    # }

    # Hard-coded fallback covering East Java; see NOTE in the docstring.
    return {
        "xmin": 110.1372,  # west
        "ymin": -9.3029,   # south
        "xmax": 114.5287,  # east
        "ymax": -5.4819    # north
    }
|
||||
|
||||
async def get_author_metadata(table_name: str):
    """Fetch the author/organization metadata row for *table_name*.

    Joins ``backend.author_metadata`` with the owning user's organization and
    returns the first matching row as a plain dict.

    Raises:
        Exception: when no metadata row exists for the table.
    """
    # text() is required by SQLAlchemy 2.x for textual SQL; :table is a bound
    # parameter, so the lookup value is safely escaped.
    sql = text("""
        SELECT am.table_title, am.dataset_title, am.dataset_abstract, am.keywords, am.date_created,
               am.organization_name, am.contact_person_name, am.created_at,
               am.contact_email, am.contact_phone, am.geom_type,
               u.organization_id,
               o.address AS organization_address,
               o.email AS organization_email,
               o.phone_number AS organization_phone
        FROM backend.author_metadata AS am
        LEFT JOIN backend.users u ON am.user_id = u.id
        LEFT JOIN backend.organizations o ON u.organization_id = o.id
        WHERE am.table_title = :table
        LIMIT 1
    """)

    async with engine.connect() as conn:
        result = await conn.execute(sql, {"table": table_name})
        row = result.fetchone()

    if not row:
        raise Exception(f"Tidak ada metadata untuk tabel: {table_name}")

    # SQLAlchemy async rows expose ._mapping for dict conversion.
    return dict(row._mapping)
|
||||
|
||||
|
||||
def map_geom_type(gtype):
    """Map a geometry-type name to an ISO 19139 geometric object code.

    Accepts a string (e.g. ``"MultiPolygon"``), a list of such strings
    (first element wins), or ``None``. Returns one of ``"surface"``,
    ``"curve"`` or ``"point"``; ``"surface"`` is the fallback.
    """
    if gtype is None:
        return "surface"

    # A list of geometry types: take the first entry, or fall back.
    if isinstance(gtype, list):
        if not gtype:
            return "surface"
        gtype = gtype[0]

    gtype = str(gtype).lower()

    # Check line/point BEFORE the generic "multi"/"polygon" test: the
    # original ordering (`"polygon" in g or "multi" in g` first) wrongly
    # mapped MultiLineString and MultiPoint to "surface".
    if "line" in gtype:
        return "curve"
    if "point" in gtype:
        return "point"
    if "polygon" in gtype or "multi" in gtype:
        return "surface"

    return "surface"
|
||||
|
||||
|
||||
def generate_metadata_xml(table_name, meta, extent, geoserver_links):
    """Render an ISO 19115/19139 metadata record as an XML string.

    Args:
        table_name: layer/table name. NOTE(review): this argument is not
            referenced inside the template (callers currently pass the
            dataset title here) — confirm whether it should be used.
        meta: author/organization metadata dict as returned by
            ``get_author_metadata`` (dataset_title, keywords, contacts, ...).
        extent: dict with ``xmin``/``ymin``/``xmax``/``ymax`` bounding-box
            values in EPSG:4326.
        geoserver_links: dict with at least ``wms_url`` and ``wfs_url``.

    Returns:
        The complete metadata document as an XML-formatted string with a
        freshly generated fileIdentifier UUID.

    NOTE(review): the dateStamp is built from datetime.utcnow() but labeled
    "+07:00", and geometricObjectCount is hard-coded to 38 — confirm both
    are intended.
    """
    # One <gmd:keyword> element per comma-separated keyword in meta["keywords"].
    keywords_xml = "".join([
        f"""
        <gmd:keyword><gco:CharacterString>{kw.strip()}</gco:CharacterString></gmd:keyword>
        """ for kw in meta["keywords"].split(",")
    ])

    # ISO geometric-object code ("surface"/"curve"/"point") for this layer.
    geom_type_code = map_geom_type(meta["geom_type"])
    print('type', geom_type_code)
    # New record identifier; GeoNetwork adopts this as the metadata UUID.
    uuid = str(uuid4())

    return f"""
    <gmd:MD_Metadata
        xmlns:gmd="http://www.isotc211.org/2005/gmd"
        xmlns:gco="http://www.isotc211.org/2005/gco"
        xmlns:srv="http://www.isotc211.org/2005/srv"
        xmlns:gmx="http://www.isotc211.org/2005/gmx"
        xmlns:gts="http://www.isotc211.org/2005/gts"
        xmlns:gsr="http://www.isotc211.org/2005/gsr"
        xmlns:gmi="http://www.isotc211.org/2005/gmi"
        xmlns:gml="http://www.opengis.net/gml/3.2"
        xmlns:xlink="http://www.w3.org/1999/xlink"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.isotc211.org/2005/gmd http://schemas.opengis.net/csw/2.0.2/profiles/apiso/1.0.0/apiso.xsd">
      <gmd:fileIdentifier>
        <gco:CharacterString>{uuid}</gco:CharacterString>
      </gmd:fileIdentifier>
      <gmd:language>
        <gmd:LanguageCode codeList="http://www.loc.gov/standards/iso639-2/" codeListValue="eng"/>
      </gmd:language>
      <gmd:characterSet>
        <gmd:MD_CharacterSetCode codeListValue="utf8" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_CharacterSetCode"/>
      </gmd:characterSet>
      <gmd:hierarchyLevel>
        <gmd:MD_ScopeCode codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_ScopeCode" codeListValue="feature"/>
      </gmd:hierarchyLevel>
      <gmd:contact>
        <gmd:CI_ResponsibleParty>
          <gmd:individualName>
            <gco:CharacterString>{meta['contact_person_name']}</gco:CharacterString>
          </gmd:individualName>
          <gmd:organisationName>
            <gco:CharacterString>{meta['organization_name']}</gco:CharacterString>
          </gmd:organisationName>
          <gmd:contactInfo>
            <gmd:CI_Contact>
              <gmd:phone>
                <gmd:CI_Telephone>
                  <gmd:voice>
                    <gco:CharacterString>{meta['organization_phone']}</gco:CharacterString>
                  </gmd:voice>
                  <gmd:facsimile>
                    <gco:CharacterString>{meta['organization_phone']}</gco:CharacterString>
                  </gmd:facsimile>
                </gmd:CI_Telephone>
              </gmd:phone>
              <gmd:address>
                <gmd:CI_Address>
                  <gmd:deliveryPoint>
                    <gco:CharacterString>{meta['organization_address']}</gco:CharacterString>
                  </gmd:deliveryPoint>
                  <gmd:city>
                    <gco:CharacterString>Surabaya</gco:CharacterString>
                  </gmd:city>
                  <gmd:administrativeArea>
                    <gco:CharacterString>Jawa Timur</gco:CharacterString>
                  </gmd:administrativeArea>
                  <gmd:country>
                    <gco:CharacterString>Indonesia</gco:CharacterString>
                  </gmd:country>
                  <gmd:electronicMailAddress>
                    <gco:CharacterString>{meta['organization_email']}</gco:CharacterString>
                  </gmd:electronicMailAddress>
                </gmd:CI_Address>
              </gmd:address>
              <gmd:hoursOfService>
                <gco:CharacterString>08.00-16.00</gco:CharacterString>
              </gmd:hoursOfService>
            </gmd:CI_Contact>
          </gmd:contactInfo>
          <gmd:role>
            <gmd:CI_RoleCode codeListValue="pointOfContact" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#CI_RoleCode"/>
          </gmd:role>
        </gmd:CI_ResponsibleParty>
      </gmd:contact>
      <gmd:dateStamp>
        <gco:DateTime>{datetime.utcnow().isoformat()}+07:00</gco:DateTime>
      </gmd:dateStamp>
      <gmd:metadataStandardName>
        <gco:CharacterString>ISO 19115:2003/19139</gco:CharacterString>
      </gmd:metadataStandardName>
      <gmd:metadataStandardVersion>
        <gco:CharacterString>1.0</gco:CharacterString>
      </gmd:metadataStandardVersion>
      <gmd:spatialRepresentationInfo>
        <gmd:MD_VectorSpatialRepresentation>
          <gmd:geometricObjects>
            <gmd:MD_GeometricObjects>
              <gmd:geometricObjectType>
                <gmd:MD_GeometricObjectTypeCode codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_GeometricObjectTypeCode" codeListValue="{geom_type_code}"/>
              </gmd:geometricObjectType>
              <gmd:geometricObjectCount>
                <gco:Integer>38</gco:Integer>
              </gmd:geometricObjectCount>
            </gmd:MD_GeometricObjects>
          </gmd:geometricObjects>
        </gmd:MD_VectorSpatialRepresentation>
      </gmd:spatialRepresentationInfo>
      <gmd:referenceSystemInfo>
        <gmd:MD_ReferenceSystem>
          <gmd:referenceSystemIdentifier>
            <gmd:RS_Identifier>
              <gmd:code>
                <gco:CharacterString>4326</gco:CharacterString>
              </gmd:code>
              <gmd:codeSpace>
                <gco:CharacterString>EPSG</gco:CharacterString>
              </gmd:codeSpace>
            </gmd:RS_Identifier>
          </gmd:referenceSystemIdentifier>
        </gmd:MD_ReferenceSystem>
      </gmd:referenceSystemInfo>
      <gmd:identificationInfo>
        <gmd:MD_DataIdentification>
          <gmd:citation>
            <gmd:CI_Citation>
              <gmd:title>
                <gco:CharacterString>{meta['dataset_title']}</gco:CharacterString>
              </gmd:title>
              <gmd:date>
                <gmd:CI_Date>
                  <gmd:date>
                    <gco:DateTime>{meta['created_at'].isoformat()}+07:00</gco:DateTime>
                  </gmd:date>
                  <gmd:dateType>
                    <gmd:CI_DateTypeCode codeListValue="publication" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#CI_DateTypeCode"/>
                  </gmd:dateType>
                </gmd:CI_Date>
              </gmd:date>
              <gmd:edition>
                <gco:CharacterString>{meta['date_created'].year}</gco:CharacterString>
              </gmd:edition>
              <gmd:citedResponsibleParty>
                <gmd:CI_ResponsibleParty>
                  <gmd:individualName>
                    <gco:CharacterString>{meta['contact_person_name']}</gco:CharacterString>
                  </gmd:individualName>
                  <gmd:organisationName>
                    <gco:CharacterString>{meta['organization_name']}</gco:CharacterString>
                  </gmd:organisationName>
                  <gmd:contactInfo>
                    <gmd:CI_Contact>
                      <gmd:phone>
                        <gmd:CI_Telephone>
                          <gmd:voice>
                            <gco:CharacterString>{meta['organization_phone']}</gco:CharacterString>
                          </gmd:voice>
                          <gmd:facsimile>
                            <gco:CharacterString>{meta['organization_phone']}</gco:CharacterString>
                          </gmd:facsimile>
                        </gmd:CI_Telephone>
                      </gmd:phone>
                      <gmd:address>
                        <gmd:CI_Address>
                          <gmd:deliveryPoint>
                            <gco:CharacterString>{meta['organization_address']}</gco:CharacterString>
                          </gmd:deliveryPoint>
                          <gmd:city>
                            <gco:CharacterString>Surabaya</gco:CharacterString>
                          </gmd:city>
                          <gmd:country>
                            <gco:CharacterString>Indonesia</gco:CharacterString>
                          </gmd:country>
                          <gmd:electronicMailAddress>
                            <gco:CharacterString>{meta['organization_email']}</gco:CharacterString>
                          </gmd:electronicMailAddress>
                        </gmd:CI_Address>
                      </gmd:address>
                      <gmd:hoursOfService>
                        <gco:CharacterString>08.00-16.00</gco:CharacterString>
                      </gmd:hoursOfService>
                    </gmd:CI_Contact>
                  </gmd:contactInfo>
                  <gmd:role>
                    <gmd:CI_RoleCode codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#CI_RoleCode" codeListValue="custodian"/>
                  </gmd:role>
                </gmd:CI_ResponsibleParty>
              </gmd:citedResponsibleParty>
              <gmd:otherCitationDetails>
                <gco:CharacterString>Timezone: UTC+7 (Asia/Jakarta)</gco:CharacterString>
              </gmd:otherCitationDetails>
            </gmd:CI_Citation>
          </gmd:citation>
          <gmd:abstract>
            <gco:CharacterString>{meta['dataset_abstract']}</gco:CharacterString>
          </gmd:abstract>
          <gmd:purpose>
            <gco:CharacterString>{meta['dataset_abstract']}</gco:CharacterString>
          </gmd:purpose>
          <gmd:status>
            <gmd:MD_ProgressCode codeListValue="completed" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_ProgressCode"/>
          </gmd:status>
          <gmd:pointOfContact>
            <gmd:CI_ResponsibleParty>
              <gmd:individualName>
                <gco:CharacterString>Lab AI Polinema</gco:CharacterString>
              </gmd:individualName>
              <gmd:organisationName>
                <gco:CharacterString>Lab AI Polinema</gco:CharacterString>
              </gmd:organisationName>
              <gmd:positionName gco:nilReason="missing"/>
              <gmd:contactInfo>
                <gmd:CI_Contact>
                  <gmd:phone>
                    <gmd:CI_Telephone>
                      <gmd:voice>
                        <gco:CharacterString>{meta['organization_phone']}</gco:CharacterString>
                      </gmd:voice>
                      <gmd:facsimile>
                        <gco:CharacterString>{meta['organization_phone']}</gco:CharacterString>
                      </gmd:facsimile>
                    </gmd:CI_Telephone>
                  </gmd:phone>
                  <gmd:address>
                    <gmd:CI_Address>
                      <gmd:deliveryPoint>
                        <gco:CharacterString>{meta['organization_address']}</gco:CharacterString>
                      </gmd:deliveryPoint>
                      <gmd:city>
                        <gco:CharacterString>Surabaya</gco:CharacterString>
                      </gmd:city>
                      <gmd:administrativeArea>
                        <gco:CharacterString>Jawa Timur</gco:CharacterString>
                      </gmd:administrativeArea>
                      <gmd:country>
                        <gco:CharacterString>Indonesia</gco:CharacterString>
                      </gmd:country>
                      <gmd:electronicMailAddress>
                        <gco:CharacterString>{meta['organization_email']}</gco:CharacterString>
                      </gmd:electronicMailAddress>
                    </gmd:CI_Address>
                  </gmd:address>
                </gmd:CI_Contact>
              </gmd:contactInfo>
              <gmd:role>
                <gmd:CI_RoleCode codeListValue="owner" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#CI_RoleCode"/>
              </gmd:role>
            </gmd:CI_ResponsibleParty>
          </gmd:pointOfContact>
          <gmd:resourceMaintenance>
            <gmd:MD_MaintenanceInformation>
              <gmd:maintenanceAndUpdateFrequency>
                <gmd:MD_MaintenanceFrequencyCode codeListValue="annually" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_MaintenanceFrequencyCode"/>
              </gmd:maintenanceAndUpdateFrequency>
            </gmd:MD_MaintenanceInformation>
          </gmd:resourceMaintenance>
          <gmd:descriptiveKeywords>
            <gmd:MD_Keywords>
              {keywords_xml}
            </gmd:MD_Keywords>
          </gmd:descriptiveKeywords>
          <gmd:resourceConstraints>
            <gmd:MD_LegalConstraints>
              <gmd:accessConstraints>
                <gmd:MD_RestrictionCode codeListValue="copyright" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_RestrictionCode"/>
              </gmd:accessConstraints>
              <gmd:useConstraints>
                <gmd:MD_RestrictionCode codeListValue="otherRestrictions" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_RestrictionCode"/>
              </gmd:useConstraints>
              <gmd:otherConstraints>
                <gco:CharacterString>Penggunaan data harus mencantumkan sumber: {meta['organization_name']}.</gco:CharacterString>
              </gmd:otherConstraints>
            </gmd:MD_LegalConstraints>
          </gmd:resourceConstraints>
          <gmd:spatialRepresentationType>
            <gmd:MD_SpatialRepresentationTypeCode codeListValue="vector" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_SpatialRepresentationTypeCode"/>
          </gmd:spatialRepresentationType>
          <gmd:spatialResolution>
            <gmd:MD_Resolution>
              <gmd:equivalentScale>
                <gmd:MD_RepresentativeFraction>
                  <gmd:denominator>
                    <gco:Integer>25000</gco:Integer>
                  </gmd:denominator>
                </gmd:MD_RepresentativeFraction>
              </gmd:equivalentScale>
            </gmd:MD_Resolution>
          </gmd:spatialResolution>
          <gmd:language>
            <gmd:LanguageCode codeList="http://www.loc.gov/standards/iso639-2/" codeListValue="eng"/>
          </gmd:language>
          <gmd:characterSet>
            <gmd:MD_CharacterSetCode codeListValue="utf8" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_CharacterSetCode"/>
          </gmd:characterSet>
          <gmd:extent>
            <gmd:EX_Extent>
              <gmd:geographicElement>
                <gmd:EX_GeographicBoundingBox>
                  <gmd:westBoundLongitude><gco:Decimal>{extent['xmin']}</gco:Decimal></gmd:westBoundLongitude>
                  <gmd:eastBoundLongitude><gco:Decimal>{extent['xmax']}</gco:Decimal></gmd:eastBoundLongitude>
                  <gmd:southBoundLatitude><gco:Decimal>{extent['ymin']}</gco:Decimal></gmd:southBoundLatitude>
                  <gmd:northBoundLatitude><gco:Decimal>{extent['ymax']}</gco:Decimal></gmd:northBoundLatitude>
                </gmd:EX_GeographicBoundingBox>
              </gmd:geographicElement>
            </gmd:EX_Extent>
          </gmd:extent>
        </gmd:MD_DataIdentification>
      </gmd:identificationInfo>
      <gmd:contentInfo>
        <gmd:MD_FeatureCatalogueDescription>
          <gmd:complianceCode>
            <gco:Boolean>true</gco:Boolean>
          </gmd:complianceCode>
          <gmd:includedWithDataset gco:nilReason="unknown"/>
          <gmd:featureCatalogueCitation>
            <gmd:CI_Citation>
              <gmd:title>
                <gco:CharacterString>{meta['dataset_title']}</gco:CharacterString>
              </gmd:title>
              <gmd:date>
                <gmd:CI_Date>
                  <gmd:date>
                    <gco:DateTime>{meta['created_at'].isoformat()}+07:00</gco:DateTime>
                  </gmd:date>
                  <gmd:dateType>
                    <gmd:CI_DateTypeCode codeListValue="publication" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#CI_DateTypeCode"/>
                  </gmd:dateType>
                </gmd:CI_Date>
              </gmd:date>
              <gmd:edition>
                <gco:CharacterString>{meta['date_created'].year}</gco:CharacterString>
              </gmd:edition>
            </gmd:CI_Citation>
          </gmd:featureCatalogueCitation>
        </gmd:MD_FeatureCatalogueDescription>
      </gmd:contentInfo>
      <gmd:distributionInfo>
        <gmd:MD_Distribution>
          <gmd:transferOptions>
            <gmd:MD_DigitalTransferOptions>
              <gmd:onLine>
                <gmd:CI_OnlineResource>
                  <gmd:linkage>
                    <gmd:URL>{geoserver_links["wms_url"]}</gmd:URL>
                  </gmd:linkage>
                  <gmd:protocol>
                    <gco:CharacterString>DB:POSTGIS</gco:CharacterString>
                  </gmd:protocol>
                  <gmd:name>
                    <gco:CharacterString>{meta["dataset_title"]}</gco:CharacterString>
                  </gmd:name>
                  <gmd:description>
                    <gco:CharacterString>{meta["dataset_title"]}</gco:CharacterString>
                  </gmd:description>
                </gmd:CI_OnlineResource>
              </gmd:onLine>
              <gmd:onLine>
                <gmd:CI_OnlineResource>
                  <gmd:linkage>
                    <gmd:URL>{geoserver_links["wms_url"]}</gmd:URL>
                  </gmd:linkage>
                  <gmd:protocol>
                    <gco:CharacterString>WWW:LINK-1.0-http--link</gco:CharacterString>
                  </gmd:protocol>
                  <gmd:name>
                    <gco:CharacterString>{meta["dataset_title"]}</gco:CharacterString>
                  </gmd:name>
                  <gmd:description>
                    <gco:CharacterString>{meta["dataset_title"]}</gco:CharacterString>
                  </gmd:description>
                </gmd:CI_OnlineResource>
              </gmd:onLine>
              <gmd:onLine>
                <gmd:CI_OnlineResource>
                  <gmd:linkage>
                    <gmd:URL>{geoserver_links["wms_url"]}</gmd:URL>
                  </gmd:linkage>
                  <gmd:protocol>
                    <gco:CharacterString>OGC:WMS</gco:CharacterString>
                  </gmd:protocol>
                  <gmd:name>
                    <gco:CharacterString>{meta["dataset_title"]}</gco:CharacterString>
                  </gmd:name>
                </gmd:CI_OnlineResource>
              </gmd:onLine>
              <gmd:onLine>
                <gmd:CI_OnlineResource>
                  <gmd:linkage>
                    <gmd:URL>{geoserver_links["wfs_url"]}</gmd:URL>
                  </gmd:linkage>
                  <gmd:protocol>
                    <gco:CharacterString>OGC:WFS</gco:CharacterString>
                  </gmd:protocol>
                  <gmd:name>
                    <gco:CharacterString>{meta["dataset_title"]}</gco:CharacterString>
                  </gmd:name>
                </gmd:CI_OnlineResource>
              </gmd:onLine>
            </gmd:MD_DigitalTransferOptions>
          </gmd:transferOptions>
        </gmd:MD_Distribution>
      </gmd:distributionInfo>
      <gmd:dataQualityInfo>
        <gmd:DQ_DataQuality>
          <gmd:scope>
            <gmd:DQ_Scope>
              <gmd:level>
                <gmd:MD_ScopeCode codeListValue="dataset" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_ScopeCode"/>
              </gmd:level>
            </gmd:DQ_Scope>
          </gmd:scope>
          <gmd:lineage>
            <gmd:LI_Lineage>
              <gmd:statement>
                <gco:CharacterString>Data dihasilkan dari digitasi peta dasar skala 1:25000 menggunakan QGIS.</gco:CharacterString>
              </gmd:statement>
            </gmd:LI_Lineage>
          </gmd:lineage>
        </gmd:DQ_DataQuality>
      </gmd:dataQualityInfo>
    </gmd:MD_Metadata>
    """
|
||||
|
||||
|
||||
# Geonetwork version 4.4.9.0
def upload_metadata_to_geonetwork(xml_metadata: str):
    """Upload an ISO 19139 record to GeoNetwork and publish it.

    Args:
        xml_metadata: the complete metadata document as an XML string.

    Returns:
        str: the UUID GeoNetwork assigned to the new record.

    Raises:
        requests.HTTPError: when the upload request fails.
        ValueError: when the response contains no record UUID.
    """
    session, xsrf_token = create_gn_session()
    headers = {
        'X-XSRF-TOKEN': xsrf_token,
        'Accept': 'application/json'
    }

    GN_API_RECORDS_URL = f"{GEONETWORK_URL}/srv/api/records"

    # GeoNetwork requires a multipart/form-data upload.
    files = {
        'file': ('metadata.xml', xml_metadata, 'application/xml')
    }

    params = {
        "ownerGroup": 1,  # all
        "ownerUser": 1    # admin
    }

    # The session already carries its cookie jar, so no explicit cookies=
    # argument is needed here.
    response = session.post(
        GN_API_RECORDS_URL,
        params=params,
        files=files,
        headers=headers
    )
    # Fail with a clear HTTP error instead of a confusing JSON decode
    # error when GeoNetwork rejects the upload.
    response.raise_for_status()

    # The new record's UUID is buried inside "metadataInfos"; take the
    # first record of the first non-empty list.
    metadata_infos = response.json().get("metadataInfos", {})
    uuid = None
    for records in metadata_infos.values():
        if records and isinstance(records, list):
            uuid = records[0].get("uuid")
            break
    if not uuid:
        raise ValueError("UUID not found in GeoNetwork response")

    # Immediately grant public view rights so the record is visible.
    publish_record(session, uuid)

    return uuid
|
||||
|
||||
|
||||
|
||||
async def publish_metadata(table_name: str, geoserver_links: dict):
    """Build, clean, and upload GeoNetwork metadata for *table_name*.

    Returns the UUID of the created GeoNetwork record.
    """
    extent = await get_extent(table_name)
    meta = await get_author_metadata(table_name)

    raw_xml = generate_metadata_xml(
        # NOTE: the dataset title is passed as the name, not the raw table.
        table_name=meta["dataset_title"],
        meta=meta,
        extent=extent,
        geoserver_links=geoserver_links,
    )

    # Escape raw ampersands inside <gmd:URL> elements before upload.
    uuid = upload_metadata_to_geonetwork(fix_xml_urls(raw_xml))

    print(f"[GeoNetwork] Metadata uploaded. UUID = {uuid}")
    return uuid
|
||||
|
||||
|
||||
|
||||
def publish_record(session, uuid):
    """Grant group 1 view rights on record *uuid*, effectively publishing it.

    Raises requests.HTTPError when GeoNetwork rejects the sharing update.
    """
    print('[uuid]', uuid)

    headers = {
        "X-XSRF-TOKEN": session.cookies.get('XSRF-TOKEN'),
        "Accept": "application/json",
        "Content-Type": "application/json",
    }

    # Reset existing privileges and allow "view" for group 1 only.
    payload = {
        "clear": True,
        "privileges": [
            {"group": 1, "operations": {"view": True}},
        ],
    }

    sharing_url = f"{GEONETWORK_URL}/srv/api/records/{uuid}/sharing"
    response = session.put(sharing_url, json=payload, headers=headers)
    response.raise_for_status()
|
||||
|
||||
|
||||
# single stand func
|
||||
# def publish_record(uuid):
|
||||
# session, xsrf_token = create_gn_session()
|
||||
|
||||
# headers = {
|
||||
# "X-XSRF-TOKEN": xsrf_token,
|
||||
# "Content-Type": "application/json"
|
||||
# }
|
||||
|
||||
# url = f"{GEONETWORK_URL}/srv/api/records/{uuid}/sharing"
|
||||
|
||||
# payload = {
|
||||
# "clear": True,
|
||||
# "privileges": [
|
||||
# {"group": 1, "operations": {"view": True}}
|
||||
# ]
|
||||
# }
|
||||
|
||||
# resp = session.put(url, json=payload, headers=headers)
|
||||
# resp.raise_for_status()
|
||||
300
app/mapset_pipeline/core/publication/publish_geoserver.py
Executable file
300
app/mapset_pipeline/core/publication/publish_geoserver.py
Executable file
|
|
@ -0,0 +1,300 @@
|
|||
import requests
|
||||
import json
|
||||
import os
|
||||
from app.core.config import GEOSERVER_URL, GEOSERVER_USER, GEOSERVER_PASS, GEOSERVER_WORKSPACE
|
||||
|
||||
# DATASTORE = "postgis" #per OPD
# GeoServer datastore every layer is published into.
DATASTORE = "server_lokal"
# SLD_DIR = "./styles"

# BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# SLD_DIR = os.path.join(BASE_DIR, "styles")

# Resolve the SLD staging folder (<package root>/style_temp) relative to
# this file so the path works regardless of the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MAIN_DIR = os.path.abspath(os.path.join(BASE_DIR, "..", ".."))
SLD_DIR = os.path.join(MAIN_DIR, "style_temp")
|
||||
|
||||
|
||||
def publish_layer_to_geoserver(table: str, job_id: str):
    """Publish PostGIS table *table* as a GeoServer layer and style it.

    Registers the feature type in the configured datastore; if the job's SLD
    file (``<SLD_DIR>/<job_id>.sld``) exists, uploads it as a workspace style,
    sets it as the layer's default style, and deletes the local file. Finally
    reloads GeoServer and returns the service URLs for metadata publication.

    Args:
        table: name of the PostGIS table / layer to publish.
        job_id: pipeline job id; names the staged SLD file.

    Returns:
        dict with keys: table, style, wms_url, wfs_url, layer_url.

    Raises:
        Exception: when GeoServer rejects the SLD upload.
    """
    print(f"[GeoServer] Publish layer + upload SLD: {table}")

    # ==========================
    # 1. Publish Feature Type
    # ==========================
    # computeDefault=true lets GeoServer derive bbox/attributes itself.
    ft_url = f"{GEOSERVER_URL}/rest/workspaces/{GEOSERVER_WORKSPACE}/datastores/{DATASTORE}/featuretypes?computeDefault=true"

    payload = {
        "featureType": {
            "name": table,
            "nativeName": table,
            "enabled": True
        }
    }

    requests.post(
        ft_url,
        auth=(GEOSERVER_USER, GEOSERVER_PASS),
        headers={"Content-Type": "application/json"},
        data=json.dumps(payload)
    )

    print(f"[GeoServer] FeatureType published for: {table}")

    # ==========================================
    # 2. Upload SLD file to GeoServer
    # ==========================================
    sld_file = f"{SLD_DIR}/{job_id}.sld"
    style_name = table  # style name mirrors the table name

    if not os.path.exists(sld_file):
        # No SLD staged for this job: skip styling AND the cleanup below
        # (the original unconditionally removed the file, which raised
        # FileNotFoundError on this path).
        print(f"[WARNING] SLD file tidak ditemukan: {sld_file}")
    else:
        print(f"[GeoServer] Upload SLD {sld_file}")

        style_url = (
            f"{GEOSERVER_URL}/rest/workspaces/"
            f"{GEOSERVER_WORKSPACE}/styles"
        )

        with open(sld_file, "r", encoding="utf-8") as f:
            sld_content = f.read()

        # GeoServer rejects SLDs with a BOM or whitespace before the XML
        # declaration, so strip any leading junk before uploading.
        sld_content = sld_content.lstrip("\ufeff \t\r\n")

        resp = requests.post(
            f"{style_url}?name={style_name}",
            auth=(GEOSERVER_USER, GEOSERVER_PASS),
            headers={"Content-Type": "application/vnd.ogc.sld+xml"},
            data=sld_content.encode("utf-8")
        )

        if resp.status_code not in (200, 201):
            raise Exception(
                f"Upload SLD gagal ({resp.status_code}): {resp.text}"
            )

        print(f"[GeoServer] SLD uploaded: {style_name}")

        # ==========================================
        # 3. Apply SLD to the layer
        # ==========================================
        layer_url = f"{GEOSERVER_URL}/rest/layers/{GEOSERVER_WORKSPACE}:{table}"

        payload = {
            "layer": {
                "defaultStyle": {
                    "name": style_name,
                    "workspace": GEOSERVER_WORKSPACE
                },
                "enabled": True
            }
        }

        requests.put(
            layer_url,
            auth=(GEOSERVER_USER, GEOSERVER_PASS),
            headers={"Content-Type": "application/json"},
            data=json.dumps(payload)
        )

        print(f"[GeoServer] SLD applied as default style for {table}")

        # ==========================================
        # 4. Delete SLD file from local folder
        # ==========================================
        os.remove(sld_file)
        print(f"[CLEANUP] SLD file removed: {sld_file}")

    # ==============================================
    # 5. Reload GeoServer (optional but recommended)
    # ==============================================
    requests.post(
        f"{GEOSERVER_URL}/rest/reload",
        auth=(GEOSERVER_USER, GEOSERVER_PASS)
    )

    # ====================================================
    # 6. Generate GeoServer WMS/WFS links for GeoNetwork
    # ====================================================
    wms_link = (
        f"{GEOSERVER_URL}/{GEOSERVER_WORKSPACE}/wms?"
        f"service=WMS&request=GetMap&layers={GEOSERVER_WORKSPACE}:{table}"
    )
    wfs_link = (
        f"{GEOSERVER_URL}/{GEOSERVER_WORKSPACE}/wfs?"
        f"service=WFS&request=GetFeature&typeName={GEOSERVER_WORKSPACE}:{table}"
    )

    # Browser preview link (application/openlayers output format).
    openlayer_url = (
        f"{GEOSERVER_URL}/{GEOSERVER_WORKSPACE}/wms?"
        f"service=WMS"
        f"&version=1.1.0"
        f"&request=GetMap"
        f"&layers={GEOSERVER_WORKSPACE}:{table}"
        f"&styles="
        f"&bbox=110.89528623700005%2C-8.780412043999945%2C116.26994997700001%2C-5.042971664999925"
        f"&width=768"
        f"&height=384"
        f"&srs=EPSG:4326"
        f"&format=application/openlayers"
    )

    return {
        "table": table,
        "style": style_name,
        "wms_url": wms_link,
        "wfs_url": wfs_link,
        "layer_url": openlayer_url
    }
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# use default style
|
||||
# def publish_layer_to_geoserver(table: str):
|
||||
|
||||
# print(f"[GeoServer] Publish layer: {table}")
|
||||
|
||||
# # ========== 1. Publish Feature Type ==========
|
||||
# ft_url = f"{GEOSERVER_URL}/rest/workspaces/{WORKSPACE}/datastores/{DATASTORE}/featuretypes"
|
||||
|
||||
# payload = {
|
||||
# "featureType": {
|
||||
# "name": table,
|
||||
# "nativeName": table,
|
||||
# "enabled": True
|
||||
# }
|
||||
# }
|
||||
|
||||
# requests.post(
|
||||
# ft_url,
|
||||
# auth=(GEOSERVER_USER, GEOSERVER_PASS),
|
||||
# headers={"Content-Type": "application/json"},
|
||||
# data=json.dumps(payload)
|
||||
# )
|
||||
|
||||
# # ===================================================
|
||||
# # 2. Tentukan SLD file (prioritas table.sld → fallback default)
|
||||
# # ===================================================
|
||||
# table_sld = SLD_DIR / f"{table}.sld"
|
||||
# default_sld = SLD_DIR / "default_style.sld"
|
||||
|
||||
# if table_sld.exists():
|
||||
# chosen_sld = table_sld
|
||||
# delete_after = True
|
||||
# style_name = table # pakai nama style sama dengan layer
|
||||
# print(f"[SLD] Menggunakan SLD khusus: {chosen_sld}")
|
||||
# else:
|
||||
# chosen_sld = default_sld
|
||||
# delete_after = False
|
||||
# style_name = "default_style"
|
||||
# print(f"[SLD] Menggunakan default SLD: {chosen_sld}")
|
||||
|
||||
# # ==========================================
|
||||
# # 3. Upload SLD
|
||||
# # ==========================================
|
||||
# style_url = f"{GEOSERVER_URL}/rest/styles"
|
||||
|
||||
# with open(chosen_sld, "rb") as sld:
|
||||
# requests.post(
|
||||
# f"{style_url}?name={style_name}&workspace={WORKSPACE}",
|
||||
# auth=(GEOSERVER_USER, GEOSERVER_PASS),
|
||||
# headers={"Content-Type": "application/vnd.ogc.sld+xml"},
|
||||
# data=sld.read()
|
||||
# )
|
||||
|
||||
# print(f"[GeoServer] SLD uploaded: {style_name}")
|
||||
|
||||
# # ==========================================
|
||||
# # 4. Apply SLD ke layer
|
||||
# # ==========================================
|
||||
# layer_url = f"{GEOSERVER_URL}/rest/layers/{WORKSPACE}:{table}"
|
||||
|
||||
# payload = {
|
||||
# "layer": {
|
||||
# "defaultStyle": {
|
||||
# "name": style_name,
|
||||
# "workspace": WORKSPACE
|
||||
# },
|
||||
# "enabled": True
|
||||
# }
|
||||
# }
|
||||
|
||||
# requests.put(
|
||||
# layer_url,
|
||||
# auth=(GEOSERVER_USER, GEOSERVER_PASS),
|
||||
# headers={"Content-Type": "application/json"},
|
||||
# data=json.dumps(payload)
|
||||
# )
|
||||
|
||||
# print(f"[GeoServer] Style '{style_name}' applied to layer '{table}'")
|
||||
|
||||
# # ==========================================
|
||||
# # 5. Delete table.sld jika ada
|
||||
# # ==========================================
|
||||
# if delete_after:
|
||||
# table_sld.unlink()
|
||||
# print(f"[CLEANUP] File SLD '{table}.sld' dihapus")
|
||||
|
||||
# # ====================================================
|
||||
# # 6. Reload GeoServer (opsional tapi aman)
|
||||
# # ====================================================
|
||||
# requests.post(
|
||||
# f"{GEOSERVER_URL}/rest/reload",
|
||||
# auth=(GEOSERVER_USER, GEOSERVER_PASS)
|
||||
# )
|
||||
|
||||
# # ====================================================
|
||||
# # 7. Generate GeoServer WMS/WFS link untuk GeoNetwork
|
||||
# # ====================================================
|
||||
|
||||
# wms_link = (
|
||||
# f"{GEOSERVER_URL}/{WORKSPACE}/wms?"
|
||||
# f"service=WMS&request=GetMap&layers={WORKSPACE}:{table}"
|
||||
# )
|
||||
|
||||
# wfs_link = (
|
||||
# f"{GEOSERVER_URL}/{WORKSPACE}/wfs?"
|
||||
# f"service=WFS&request=GetFeature&typeName={WORKSPACE}:{table}"
|
||||
# )
|
||||
|
||||
# print(f"[GeoServer] WMS URL: {wms_link}")
|
||||
# print(f"[GeoServer] WFS URL: {wfs_link}")
|
||||
|
||||
# return {
|
||||
# "table": table,
|
||||
# "style": style_name,
|
||||
# "wms_url": wms_link,
|
||||
# "wfs_url": wfs_link
|
||||
# }
|
||||
|
||||
|
||||
18
app/mapset_pipeline/core/readers/__init__.py
Executable file
18
app/mapset_pipeline/core/readers/__init__.py
Executable file
|
|
@ -0,0 +1,18 @@
|
|||
# Import fungsi utama dari masing-masing file reader
|
||||
# (Titik '.' berarti import dari folder yang sama)
|
||||
|
||||
from .reader_csv import read_csv
|
||||
from .reader_shp import read_shp
|
||||
from .reader_gdb import read_gdb
|
||||
from .reader_mpk import read_mpk
|
||||
from .reader_pdf import read_pdf, convert_df
|
||||
|
||||
# Opsional: Mendefinisikan apa yang akan ter-import jika orang mengetik "from ... import *"
|
||||
__all__ = [
|
||||
"read_csv",
|
||||
"read_shp",
|
||||
"read_gdb",
|
||||
"read_mpk",
|
||||
"read_pdf",
|
||||
"convert_df"
|
||||
]
|
||||
228
app/mapset_pipeline/core/readers/reader_csv.py
Executable file
228
app/mapset_pipeline/core/readers/reader_csv.py
Executable file
|
|
@ -0,0 +1,228 @@
|
|||
import pandas as pd
|
||||
import re
|
||||
import csv
|
||||
import os
|
||||
|
||||
def detect_header_line(path, max_rows=10):
    """Guess which line of a delimited text file is the header row.

    Scores each of the first ``max_rows`` lines: a header is expected to be
    mostly alphabetic cells with few numeric cells, so the line maximising
    (alphabetic ratio - digit ratio) wins.

    Args:
        path: Path to the text/CSV file.
        max_rows: Number of leading lines to inspect.

    Returns:
        int: Zero-based index of the most header-like line (0 for an
        empty file).
    """
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        # Bug fix: the original used `next(f)` inside a list comprehension,
        # which raises StopIteration when the file has fewer than
        # `max_rows` lines. Read defensively and stop at EOF instead.
        lines = []
        for _ in range(max_rows):
            line = f.readline()
            if not line:
                break
            lines.append(line)

    header_line_idx = 0
    best_score = -1
    for i, line in enumerate(lines):
        cells = re.split(r'[;,|\t]', line.strip())
        # Fraction of cells containing letters vs. containing digits.
        alpha_ratio = sum(bool(re.search(r'[A-Za-z]', c)) for c in cells) / max(len(cells), 1)
        digit_ratio = sum(bool(re.search(r'\d', c)) for c in cells) / max(len(cells), 1)
        score = alpha_ratio - digit_ratio
        if score > best_score:
            best_score = score
            header_line_idx = i
    return header_line_idx
|
||||
|
||||
def detect_delimiter(path, sample_size=2048):
    """Detect the cell delimiter of a text file.

    Tries ``csv.Sniffer`` on a leading sample of the file; when sniffing
    fails, falls back to the first common delimiter present in the sample,
    defaulting to a comma.

    Args:
        path: Path to the text file.
        sample_size: Number of characters to sample from the file start.

    Returns:
        str: The detected single-character delimiter.
    """
    with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
        sample = handle.read(sample_size)

    try:
        return csv.Sniffer().sniff(sample).delimiter
    except Exception:
        # Sniffer could not decide: simple presence scan over the usual
        # suspects, in priority order.
        candidates = (',', ';', '\t', '|')
        return next((d for d in candidates if d in sample), ',')
|
||||
|
||||
|
||||
# def read_csv(path: str, sheet: str = None):
|
||||
# ext = os.path.splitext(path)[1].lower()
|
||||
|
||||
# try:
|
||||
# if ext in ['.csv']:
|
||||
# header_line = detect_header_line(path)
|
||||
# delimiter = detect_delimiter(path)
|
||||
# print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
|
||||
|
||||
# df = pd.read_csv(
|
||||
# path,
|
||||
# header=header_line,
|
||||
# sep=delimiter,
|
||||
# encoding='utf-8',
|
||||
# low_memory=False,
|
||||
# thousands=','
|
||||
# )
|
||||
|
||||
# elif ext in ['.xlsx', '.xls']:
|
||||
# print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
|
||||
# xls = pd.ExcelFile(path)
|
||||
# print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")
|
||||
|
||||
# if sheet:
|
||||
# if sheet not in xls.sheet_names:
|
||||
# raise ValueError(f"Sheet '{sheet}' tidak ditemukan dalam file {os.path.basename(path)}")
|
||||
# print(f"[INFO] Membaca sheet yang ditentukan: '{sheet}'")
|
||||
# df = pd.read_excel(xls, sheet_name=sheet, header=0, dtype=str)
|
||||
# df = df.dropna(how='all').dropna(axis=1, how='all')
|
||||
|
||||
# else:
|
||||
# print("[INFO] Tidak ada sheet yang ditentukan, mencari sheet paling relevan...")
|
||||
# best_sheet = None
|
||||
# best_score = -1
|
||||
# best_df = None
|
||||
|
||||
# for sheet_name in xls.sheet_names:
|
||||
# try:
|
||||
# temp_df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
|
||||
# temp_df = temp_df.dropna(how='all').dropna(axis=1, how='all')
|
||||
|
||||
# if len(temp_df) == 0 or len(temp_df.columns) < 2:
|
||||
# continue
|
||||
|
||||
# # hitung skor relevansi
|
||||
# text_ratio = temp_df.applymap(lambda x: isinstance(x, str)).sum().sum() / (temp_df.size or 1)
|
||||
# row_score = len(temp_df)
|
||||
# score = (row_score * 0.7) + (text_ratio * 100)
|
||||
|
||||
# if score > best_score:
|
||||
# best_score = score
|
||||
# best_sheet = sheet_name
|
||||
# best_df = temp_df
|
||||
|
||||
# except Exception as e:
|
||||
# print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
|
||||
# continue
|
||||
|
||||
# if best_df is not None:
|
||||
# print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
|
||||
# df = best_df
|
||||
# else:
|
||||
# raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
|
||||
|
||||
# for col in df.columns:
|
||||
# if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
|
||||
# df[col] = df[col].astype(str).str.replace(',', '', regex=False)
|
||||
# df[col] = pd.to_numeric(df[col], errors='ignore')
|
||||
|
||||
# else:
|
||||
# raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)")
|
||||
|
||||
# except Exception as e:
|
||||
# print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
|
||||
# df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
|
||||
|
||||
# df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
|
||||
# df.columns = [str(c).strip() for c in df.columns]
|
||||
# df = df.dropna(how='all')
|
||||
|
||||
# return df
|
||||
|
||||
|
||||
|
||||
def read_csv(path: str, sheet: str = None):
    """Read a tabular file (.csv, .xlsx or .xls) into a cleaned DataFrame.

    For CSV input the header row and delimiter are auto-detected. For
    Excel input either the requested ``sheet`` is read, or every sheet is
    scored (row count plus share of textual cells) and the best one kept.
    After reading, unnamed columns are dropped, column names stripped and
    numeric-looking columns coerced to numbers (bad cells become NaN).

    Args:
        path: Path to the input file.
        sheet: Optional Excel sheet name; ignored for CSV input.

    Returns:
        pandas.DataFrame: The parsed table, or an empty DataFrame when the
        file could not be recovered at all.
    """
    ext = os.path.splitext(path)[1].lower()
    df = pd.DataFrame()  # Safe default so the cleaning block below never NameErrors.

    try:
        # --- File reading ---
        if ext in ['.csv']:
            header_line = detect_header_line(path)
            delimiter = detect_delimiter(path)
            print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")

            df = pd.read_csv(
                path,
                header=header_line,
                sep=delimiter,
                encoding='utf-8',
                low_memory=False,
                thousands=','
            )

        elif ext in ['.xlsx', '.xls']:
            print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
            # Bug fix: openpyxl only understands .xlsx; forcing it on a
            # legacy .xls file always fails. Let pandas pick the engine
            # for .xls input.
            xls = pd.ExcelFile(path, engine='openpyxl' if ext == '.xlsx' else None)
            print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")

            if sheet:
                if sheet not in xls.sheet_names:
                    raise ValueError(f"Sheet '{sheet}' tidak ditemukan.")
                print(f"[INFO] Membaca sheet yang ditentukan: '{sheet}'")
                # The engine is already bound to the ExcelFile object, so
                # it must not be passed again here.
                df = pd.read_excel(xls, sheet_name=sheet, header=0, dtype=str)
                df = df.dropna(how='all').dropna(axis=1, how='all')

            else:
                print("[INFO] Tidak ada sheet yang ditentukan, mencari sheet paling relevan...")
                best_sheet = None
                best_score = -1
                best_df = None

                for sheet_name in xls.sheet_names:
                    try:
                        temp_df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
                        temp_df = temp_df.dropna(how='all').dropna(axis=1, how='all')

                        # Skip empty or effectively single-column sheets.
                        if len(temp_df) == 0 or len(temp_df.columns) < 2:
                            continue

                        # Relevance score: mostly row count, plus textual density.
                        # Bug fix: DataFrame.applymap was removed in pandas 3
                        # (this project pins pandas ^3.0.0); DataFrame.map is
                        # the supported element-wise equivalent.
                        text_ratio = temp_df.map(lambda x: isinstance(x, str)).sum().sum() / (temp_df.size or 1)
                        row_score = len(temp_df)
                        score = (row_score * 0.7) + (text_ratio * 100)

                        if score > best_score:
                            best_score = score
                            best_sheet = sheet_name
                            best_df = temp_df
                    except Exception as e:
                        print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
                        continue

                if best_df is not None:
                    print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
                    df = best_df
                else:
                    raise ValueError("Tidak ada sheet valid yang dapat dibaca.")

        else:
            raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)")

        # --- Cleaning (only after a successful read) ---
        # Wrapped so numeric-conversion errors never undo a successful read.
        if not df.empty:
            df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
            df.columns = [str(c).strip() for c in df.columns]
            df = df.dropna(how='all')

            # Safer numeric conversion per column.
            for col in df.columns:
                try:
                    # Does the column look numeric (possibly with thousands commas)?
                    if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
                        clean_col = df[col].astype(str).str.replace(',', '', regex=False)
                        # errors='coerce': bad cells (NaN/#REF!) become NaN
                        # instead of crashing the whole conversion.
                        df[col] = pd.to_numeric(clean_col, errors='coerce')
                except Exception as ex:
                    # Leave the column as string/object and continue.
                    print(f"[WARN] Gagal konversi numerik pada kolom '{col}': {ex}")

        return df

    except Exception as e:
        print(f"[WARN] Gagal membaca file utama ({e}).")

        # Only fall back to the plain CSV reader for text-like inputs;
        # never force read_csv onto an Excel file.
        if ext in ['.csv', '.txt']:
            print("[INFO] Mencoba fallback ke default CSV reader...")
            try:
                return pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
            except Exception as e2:
                print(f"[ERROR] Fallback CSV juga gagal: {e2}")

        # Excel (or fallback) could not be recovered: return an empty frame.
        print("[ERROR] Tidak dapat memulihkan pembacaan file Excel.")
        return pd.DataFrame()
|
||||
|
||||
|
||||
75
app/mapset_pipeline/core/readers/reader_gdb.py
Executable file
75
app/mapset_pipeline/core/readers/reader_gdb.py
Executable file
|
|
@ -0,0 +1,75 @@
|
|||
import geopandas as gpd
|
||||
import fiona
|
||||
import zipfile
|
||||
import tempfile
|
||||
import os
|
||||
import shutil
|
||||
|
||||
def read_gdb(zip_path: str, layer: str = None):
    """Read a layer from an ESRI File Geodatabase packed in a ZIP archive.

    The archive is extracted to a temporary directory; a ``*.gdb`` folder
    is located (or reconstructed from loose ``*.gdbtable`` files), the
    requested layer (or the first available one) is loaded with GeoPandas,
    and the temporary directory is always removed afterwards.

    Args:
        zip_path: Path to a ZIP containing a .gdb folder or .gdbtable files.
        layer: Optional layer name; defaults to the first listed layer.

    Returns:
        geopandas.GeoDataFrame with a CRS set (EPSG:4326 assumed when missing).

    Raises:
        ValueError: Non-ZIP input, ZIP without GDB content, no readable
            layer, or a layer that fails to load.
    """
    if not zip_path.lower().endswith(".zip"):
        raise ValueError("File GDB harus berupa ZIP yang berisi folder .gdb atau file .gdbtable")

    tmpdir = tempfile.mkdtemp()
    # Fix: the original leaked tmpdir on several error paths (e.g. when
    # fiona.listlayers raised). try/finally guarantees cleanup.
    try:
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(tmpdir)

        # macOS archives ship a metadata folder that confuses the scan below.
        macosx_path = os.path.join(tmpdir, "__MACOSX")
        if os.path.exists(macosx_path):
            shutil.rmtree(macosx_path)

        gdb_folders = []
        for root, dirs, _ in os.walk(tmpdir):
            for d in dirs:
                if d.lower().endswith(".gdb"):
                    gdb_folders.append(os.path.join(root, d))

        if not gdb_folders:
            # No .gdb folder: try rebuilding one from loose .gdbtable files.
            gdbtable_files = []
            for root, _, files in os.walk(tmpdir):
                for f in files:
                    if f.lower().endswith(".gdbtable"):
                        gdbtable_files.append(os.path.join(root, f))

            if gdbtable_files:
                first_folder = os.path.dirname(gdbtable_files[0])
                base_name = os.path.basename(first_folder)
                gdb_folder_path = os.path.join(tmpdir, f"{base_name}.gdb")

                os.makedirs(gdb_folder_path, exist_ok=True)

                # Move every GDB-related file into the rebuilt .gdb folder.
                for fpath in os.listdir(first_folder):
                    if ".gdb" in fpath.lower():
                        shutil.move(os.path.join(first_folder, fpath), os.path.join(gdb_folder_path, fpath))

                gdb_folders.append(gdb_folder_path)
            else:
                raise ValueError("Tidak ditemukan folder .gdb atau file .gdbtable di dalam ZIP")

        gdb_path = gdb_folders[0]

        layers = fiona.listlayers(gdb_path)

        chosen_layer = layer or (layers[0] if layers else None)
        if not chosen_layer:
            raise ValueError("Tidak ada layer GDB yang bisa dibaca.")

        print(f"[DEBUG] Membaca layer: {chosen_layer}")

        try:
            gdf = gpd.read_file(gdb_path, layer=chosen_layer)
        except Exception as e:
            raise ValueError(f"Gagal membaca layer dari GDB: {e}")

        if gdf.crs is None:
            # No projection info shipped: assume WGS84 — TODO confirm upstream.
            gdf.set_crs("EPSG:4326", inplace=True)

        return gdf
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)
|
||||
72
app/mapset_pipeline/core/readers/reader_mpk.py
Executable file
72
app/mapset_pipeline/core/readers/reader_mpk.py
Executable file
|
|
@ -0,0 +1,72 @@
|
|||
import os
|
||||
import tempfile
|
||||
import json
|
||||
from io import BytesIO
|
||||
import geopandas as gpd
|
||||
from py7zr import SevenZipFile
|
||||
import pyogrio
|
||||
|
||||
|
||||
def find_data_source(extract_dir: str):
    """Locate the supported data source inside an extraction directory.

    A ``*.gdb`` directory takes priority; otherwise the first ``*.shp``
    file found (in walk order) wins.

    Args:
        extract_dir: Directory to scan recursively.

    Returns:
        str: Absolute path of the located .gdb folder or .shp file.

    Raises:
        ValueError: When neither kind of source exists.
    """
    # Pass 1: prefer a File Geodatabase directory.
    for root, dirs, _ in os.walk(extract_dir):
        gdb_dirs = [d for d in dirs if d.lower().endswith(".gdb")]
        if gdb_dirs:
            return os.path.join(root, gdb_dirs[0])

    # Pass 2: fall back to the first shapefile.
    for root, _, files in os.walk(extract_dir):
        shp_files = [f for f in files if f.lower().endswith(".shp")]
        if shp_files:
            return os.path.join(root, shp_files[0])

    raise ValueError("Tidak ditemukan data source yang didukung (.gdb atau .shp).")
|
||||
|
||||
|
||||
def get_main_layer(gdb_path: str):
    """Return the name of the main layer in a geodatabase (.gdb).

    Attachment layers (``*__ATTACH``) are skipped; if only attachment
    layers exist, the first listed layer is returned as a last resort.

    Raises:
        ValueError: When the layer list cannot be read or is empty.
    """
    try:
        layers = pyogrio.list_layers(gdb_path)
        # First layer whose name is not an ESRI attachment table.
        main = next(
            (entry[0] for entry in layers if not entry[0].lower().endswith("__attach")),
            None,
        )
        if main is not None:
            return main
        if layers:
            return layers[0][0]
        raise ValueError(f"Tidak ada layer utama yang valid di {gdb_path}")
    except Exception as e:
        raise ValueError(f"Gagal membaca daftar layer GDB: {e}")
|
||||
|
||||
|
||||
def read_mpk(path: str):
    """Read an ArcGIS Map Package (.mpk) and return its features.

    The package (a 7z archive) is unpacked into a temporary directory,
    the contained data source (.gdb or .shp) is located and loaded, and
    the result is reprojected to WGS84 (EPSG:4326).

    Args:
        path: Path to the .mpk file.

    Returns:
        geopandas.GeoDataFrame in EPSG:4326.

    Raises:
        ValueError: Empty/corrupt package, unsupported content, or a
            source without projection information.
    """
    with open(path, "rb") as handle:
        mpk_bytes = handle.read()

    if not mpk_bytes:
        raise ValueError("File MPK kosong atau tidak valid.")

    with tempfile.TemporaryDirectory() as tempdir:
        try:
            with SevenZipFile(BytesIO(mpk_bytes), mode="r") as archive:
                archive.extractall(path=tempdir)
        except Exception as e:
            raise ValueError(f"File MPK rusak atau tidak valid: {e}")

        src_path = find_data_source(tempdir)

        if src_path.lower().endswith(".gdb"):
            # Geodatabase: load only the main (non-attachment) layer.
            gdf = gpd.read_file(src_path, layer=get_main_layer(src_path))
        else:
            gdf = gpd.read_file(src_path)

        if gdf.crs is None:
            raise ValueError("CRS tidak terdeteksi. Pastikan file memiliki informasi proyeksi (.prj).")

        gdf = gdf.to_crs(epsg=4326)

        print(f"[INFO] Berhasil membaca {len(gdf)} fitur")
        return gdf
|
||||
288
app/mapset_pipeline/core/readers/reader_pdf.py
Executable file
288
app/mapset_pipeline/core/readers/reader_pdf.py
Executable file
|
|
@ -0,0 +1,288 @@
|
|||
import re
|
||||
import pdfplumber
|
||||
import pandas as pd
|
||||
from app.mapset_pipeline.utils.pdf_cleaner import get_number_column_index, get_start_end_number, normalize_number_column, row_ratio, has_mixed_text_and_numbers, is_short_text_row, parse_page_selection, filter_geo_admin_column, cleaning_column
|
||||
from services.upload_file.upload_exceptions import PDFReadError
|
||||
from utils.logger_config import setup_logger
|
||||
|
||||
logger = setup_logger(__name__)
|
||||
|
||||
def detect_header_rows(rows):
    """Split raw extracted table rows into header rows and body rows.

    The body is assumed to start at the first row that mixes text and
    numbers, is numeric-heavy (ratio > 0.3), contains a pure-integer
    cell, or follows a fully non-numeric row with a numeric one. Among
    the header candidates, short text-only rows are kept only when the
    next candidate row is fully non-numeric (likely wrapped header text).

    Args:
        rows: List of table rows (lists of cell values) as extracted.

    Returns:
        tuple[list, list]: (header_rows, body_rows).
    """
    if not rows:
        # Bug fix: callers unpack two values (`head, body = ...`); the
        # original returned a bare [] here, which raised ValueError on
        # unpacking. Return an empty pair instead.
        return [], []

    ratios = [row_ratio(r) for r in rows]
    body_start_index = None

    for i in range(1, len(rows)):
        row = rows[i]
        if has_mixed_text_and_numbers(row):
            body_start_index = i
            break
        if ratios[i] > 0.3:
            body_start_index = i
            break
        if any(isinstance(c, str) and re.match(r'^\d+$', c.strip()) for c in row):
            body_start_index = i
            break
        if ratios[i - 1] == 0 and ratios[i] > 0:
            body_start_index = i
            break

    if body_start_index is None:
        # No body detected: every row is a header candidate.
        body_start_index = len(rows)

    potential_headers = rows[:body_start_index]
    body_filtered = rows[body_start_index:]
    header_filtered = []
    for idx, row in enumerate(potential_headers):
        if is_short_text_row(row):
            # Keep a short text row only when the following candidate is
            # fully non-numeric (wrapped header fragment); drop otherwise.
            if idx + 1 < len(potential_headers) and ratios[idx + 1] == 0:
                header_filtered.append(row)
            else:
                continue
        else:
            header_filtered.append(row)

    return header_filtered, body_filtered
|
||||
|
||||
|
||||
def merge_multiline_header(header_rows):
    """Collapse a multi-row table header into a single list of labels.

    For each column the bottom-most non-blank cell wins; embedded
    newlines are flattened to spaces and blank labels are dropped.

    Args:
        header_rows: List of header rows (lists of cell values).

    Returns:
        list[str]: Cleaned, non-empty column labels.
    """
    merged = []
    for column_cells in zip(*header_rows):
        label = ''
        # Walk bottom-up: the last filled cell is the real label.
        for cell in reversed(column_cells):
            if cell and str(cell).strip():
                label = cell
                break
        merged.append(str(label).replace('\n', ' ').strip())
    return [label for label in merged if label not in ['', None]]
|
||||
|
||||
def merge_parsed_table(tables):
    """Merge continuation-table fragments into their root tables.

    A table whose running-number column starts at 1 (or that has no
    number column at all) is a *root*; any other table is a *fragment*.
    A fragment is appended (rows extended in place) to the first root
    with identical columns whose numbering ends exactly one before the
    fragment starts.

    NOTE(review): root tables are mutated in place, and fragments that do
    not chain onto any root are silently dropped from the result —
    presumably intentional; confirm with the caller.

    Args:
        tables: List of {"title", "columns", "rows"} dicts.

    Returns:
        list[dict]: The root tables, with matching fragments merged in.
    """
    roots = []
    fragments = []

    # STEP 1: classification — roots vs. continuation fragments.
    for table in tables:
        num_idx = get_number_column_index(table["columns"])
        if num_idx is None:
            # No running-number column: treat as a standalone root.
            roots.append(table)
            continue

        start_no, _ = get_start_end_number(table["rows"], num_idx)
        if start_no == 1:
            roots.append(table)
        else:
            fragments.append(table)

    # STEP 2: attach each fragment to its matching root.
    for frag in fragments:
        frag_idx = get_number_column_index(frag["columns"])
        f_start, _ = get_start_end_number(frag["rows"], frag_idx)

        for root in roots:
            # Columns must match exactly for a merge to make sense.
            if root["columns"] != frag["columns"]:
                continue

            root_idx = get_number_column_index(root["columns"])
            _, r_end = get_start_end_number(root["rows"], root_idx)

            # Numbering must continue seamlessly (e.g. root ends at 25,
            # fragment starts at 26).
            if f_start == r_end + 1:
                root["rows"].extend(frag["rows"])
                break  # a fragment may attach to only one root

    return roots
|
||||
|
||||
|
||||
def read_pdf(path: str, page: str):
    """Read tables from a PDF file semi-automatically using ``pdfplumber``.

    Main flow:
    1. Open the PDF with pdfplumber.
    2. Select pages from the ``page`` expression (e.g. "1,3-5" for pages
       1 and 3 through 5).
    3. Detect tables on every selected page.
    4. Extract each raw table (list of lists).
    5. Split header and body rows with ``detect_header_rows()``.
    6. Merge multi-line headers (e.g. two-row column titles).
    7. Clean the body with ``cleaning_column()``:
       - removes the running-number column;
       - aligns the column count with the header.
    8. Assemble the result as JSON-shaped dicts:
       {"title": <page/table label>, "columns": [...], "rows": [...]}.
    9. Apply ``filter_geo_admin_column()`` (geospatial-metadata filter).
    10. Return a list of JSON-ready tables for the frontend API.

    Args:
        path: Location of the PDF file to read.
        page: Page number or range expression, e.g. "1", "2-4", "1,3-5".

    Returns:
        list[dict]: Extracted tables with their columns and rows.

    Raises:
        PDFReadError: On any failure while reading or parsing the PDF.
    """
    try:
        pdf_path = path
        selectedPage = page if page else "1"
        tables_data = []

        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            selected_pages = parse_page_selection(selectedPage, total_pages)

            logger.info(f"[INFO] Total Halaman PDF: {total_pages}")
            logger.info(f"[INFO] Total Halaman yang dipilih: {len(selected_pages)}")
            logger.info(f"[INFO] Halaman yang dipilih untuk dibaca: {selected_pages}")

            for page_num in selected_pages:
                pdf_page = pdf.pages[page_num - 1]
                tables = pdf_page.find_tables()
                logger.info(f"\n\n[INFO] Halaman {page_num}: {len(tables)} tabel terdeteksi")

                # NOTE: reading the table title from the text line above the
                # table was dropped — that heuristic is not valid for
                # landscape pages.

                for i, t in enumerate(tables, start=1):
                    table = t.extract()
                    # Keep only tables with more than two rows
                    # (i.e. at least a header plus some data).
                    if len(table) > 2:
                        print(f"[TBL] tabel : {i} - halaman {page_num}")
                        tables_data.append({"page": f"halaman {page_num} - {i}", "table": table})

        logger.info(f"\nTotal tabel terbaca: {len(tables_data)}\n")

        header_only, body_only, page_info = [], [], []
        for tbl in tables_data:
            head, body = detect_header_rows(tbl["table"])
            header_only.append(head)
            body_only.append(body)
            page_info.append(tbl["page"])

        clean_header = [merge_multiline_header(h) for h in header_only]
        clean_body = []

        for i, raw_body in enumerate(body_only):
            # Drop empty cells before aligning columns with the header.
            con_body = [[cell for cell in row if cell not in (None, '')] for row in raw_body]
            cleaned = cleaning_column(clean_header[i], [con_body])
            clean_body.append(cleaned[0])

        parsed = []
        # Fix: the loop variable was named `page`, shadowing the function
        # parameter; renamed to `page_label` (no behavior change — the
        # parameter is not used past this point).
        for i, (cols, rows, page_label) in enumerate(zip(clean_header, clean_body, page_info), start=1):
            parsed.append({
                "title": page_label,
                "columns": cols,
                "rows": rows
            })

        # =================================================================

        clean_parsed = filter_geo_admin_column(parsed)
        merge_parsed = merge_parsed_table(clean_parsed)

        logger.info(f"\nTotal tabel valid: {len(merge_parsed)}\n")

        ordered_tables = [normalize_number_column(t) for t in merge_parsed]
        return ordered_tables

    except Exception as e:
        raise PDFReadError(f"Gagal membaca PDF: {e}", code=422)
|
||||
|
||||
|
||||
def convert_df(payload):
    """Convert a parsed-table payload into a pandas DataFrame.

    ``payload`` must carry list-valued "columns" and "rows" keys with
    every row matching the column count; an optional "title" is stored in
    ``DataFrame.attrs``.

    Args:
        payload: Dict with "columns", "rows" and optionally "title".

    Returns:
        pandas.DataFrame built from the payload.

    Raises:
        PDFReadError: (code 400) wrapping any validation or build error.
    """
    try:
        if "columns" not in payload or "rows" not in payload:
            raise ValueError("Payload tidak memiliki key 'columns' atau 'rows'.")

        columns = payload["columns"]
        rows = payload["rows"]
        if not isinstance(columns, list):
            raise TypeError("'columns' harus berupa list.")
        if not isinstance(rows, list):
            raise TypeError("'rows' harus berupa list.")

        # Every row must be exactly as wide as the header.
        expected_width = len(columns)
        for i, row in enumerate(rows):
            if len(row) != expected_width:
                raise ValueError(f"Jumlah elemen di baris ke-{i} tidak sesuai jumlah kolom.")

        df = pd.DataFrame(rows, columns=columns)

        if "title" in payload:
            df.attrs["title"] = payload["title"]

        return df

    except Exception as e:
        raise PDFReadError(f"Gagal konversi payload ke DataFrame: {e}", code=400)
|
||||
60
app/mapset_pipeline/core/readers/reader_shp.py
Executable file
60
app/mapset_pipeline/core/readers/reader_shp.py
Executable file
|
|
@ -0,0 +1,60 @@
|
|||
import geopandas as gpd
|
||||
import fiona
|
||||
import zipfile
|
||||
import tempfile
|
||||
import os
|
||||
import shutil
|
||||
from shapely.geometry import shape
|
||||
|
||||
def read_shp(path: str):
    """Read a shapefile, either directly or from inside a ZIP archive.

    When GeoPandas returns empty geometry, the features are rebuilt from
    the raw Fiona records as a fallback. A missing CRS is assumed to be
    EPSG:4326.

    Args:
        path: Path to a .shp file or a .zip containing one.

    Returns:
        geopandas.GeoDataFrame

    Raises:
        ValueError: Empty path, ZIP without a .shp, or unreadable shapefile.
    """
    if not path:
        raise ValueError("Path shapefile tidak boleh kosong.")

    tmpdir = None
    shp_path = None

    # Fix: the original only removed tmpdir at the very end, leaking it
    # whenever an earlier error path raised ("no .shp in ZIP", read
    # failure). try/finally guarantees cleanup.
    try:
        if path.lower().endswith(".zip"):
            tmpdir = tempfile.mkdtemp()
            with zipfile.ZipFile(path, "r") as zip_ref:
                zip_ref.extractall(tmpdir)

            shp_files = []
            for root, _, files in os.walk(tmpdir):
                for f in files:
                    if f.lower().endswith(".shp"):
                        shp_files.append(os.path.join(root, f))

            if not shp_files:
                raise ValueError("Tidak ditemukan file .shp di dalam ZIP.")
            # Use the first shapefile found in walk order.
            shp_path = shp_files[0]
            print(f"[DEBUG] Membaca shapefile: {os.path.basename(shp_path)}")

        else:
            shp_path = path

        try:
            gdf = gpd.read_file(shp_path)
        except Exception as e:
            raise ValueError(f"Gagal membaca shapefile: {e}")

        if "geometry" not in gdf.columns or gdf.geometry.is_empty.all():
            print("[WARN] Geometry kosong. Mencoba membangun ulang dari fitur mentah...")

            # Fallback: rebuild rows from raw Fiona features.
            with fiona.open(shp_path) as src:
                features = []
                for feat in src:
                    geom = shape(feat["geometry"]) if feat["geometry"] else None
                    props = feat["properties"]
                    props["geometry"] = geom
                    features.append(props)

                gdf = gpd.GeoDataFrame(features, geometry="geometry", crs=src.crs)

        if gdf.crs is None:
            # No .prj information: assume WGS84 — TODO confirm upstream.
            gdf.set_crs("EPSG:4326", inplace=True)

        return gdf
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
|
||||
259
app/mapset_pipeline/data/repository.py
Executable file
259
app/mapset_pipeline/data/repository.py
Executable file
|
|
@ -0,0 +1,259 @@
|
|||
import os
|
||||
import json
|
||||
import asyncio
|
||||
import pandas as pd
|
||||
from shapely import wkt, wkb
|
||||
from shapely.geometry import MultiPolygon, MultiLineString
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
|
||||
# Import koneksi database Anda
|
||||
from database.connection import engine
|
||||
from app.mapset_pipeline.utils.formatters import str_to_date
|
||||
|
||||
|
||||
async def generate_unique_table_name(base_name: str) -> str:
    """Return a table name that does not yet exist in the database.

    The base name is normalized to snake_case; when taken, numeric
    suffixes (_2, _3, ...) are tried until a free name is found.

    Args:
        base_name: Desired human-readable table name.

    Returns:
        str: A normalized, not-yet-existing table name.
    """
    normalized = base_name.lower().replace(" ", "_").replace("-", "_")

    async with engine.connect() as conn:
        candidate = normalized
        suffix = 2
        while True:
            # to_regclass resolves against the current search path and
            # yields NULL when the relation does not exist.
            result = await conn.execute(
                text("SELECT to_regclass(:tname)"),
                {"tname": candidate}
            )
            if not result.scalar():
                return candidate

            candidate = f"{normalized}_{suffix}"
            suffix += 1
|
||||
|
||||
|
||||
async def insert_parquet_to_postgis(filename: str, table_name: str):
    """
    Read a temporary parquet file, clean its rows, and bulk-COPY them into
    PostGIS via the asyncpg pool for high throughput.

    Steps: load the parquet off the event loop, upper-case/strip column
    names, validate a GEOM column exists, convert each geometry to EWKT
    (SRID 4326, forced to Multi* and repaired with buffer(0) when invalid),
    stringify all attributes, create the target table with TEXT columns,
    COPY the rows, then ALTER the geom column to a typed geometry with a
    GIST index. The temp file is removed on success.

    NOTE(review): `table_name` is interpolated directly into SQL — it is
    presumably generated internally (see generate_unique_table_name);
    confirm it is never user-controlled.

    Args:
        filename: Name of the parquet file inside the local "tmp" folder.
        table_name: Destination table name.

    Returns:
        dict: {"table_name", "row_count", "geom_type"} summary.

    Raises:
        FileNotFoundError: When the temp parquet file is missing.
        ValueError: Missing GEOM column or no valid geometry rows.
    """
    # Imported here to avoid a circular import with main at module load.
    from main import db_pool
    file_path = os.path.join("tmp", filename)

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File temp {file_path} tidak ditemukan")

    try:
        loop = asyncio.get_running_loop()
        # Read parquet (CPU bound; run in executor so big files don't block the loop).
        df = await loop.run_in_executor(None, pd.read_parquet, file_path)

        # 1. CLEAN COLUMN NAMES (upper-cased, whitespace-stripped).
        df.columns = [str(col).strip().upper() for col in df.columns]

        # Standardize the GEOM column.
        # NOTE(review): this rename maps "GEOM" to itself — a no-op as written.
        if "GEOM" in df.columns:
            df.rename(columns={"GEOM": "GEOM"}, inplace=True)

        if "GEOM" not in df.columns:
            raise ValueError("Kolom GEOM tidak ditemukan dalam Parquet")

        # 2. PREPARE DATA ROWS
        clean_rows = []
        geom_types = set()

        # All attributes except GEOM.
        attr_columns = [col for col in df.columns if col != "GEOM"]

        for row in df.itertuples(index=False):
            # --- Handle GEOM ---
            raw_geom = getattr(row, "GEOM", None)
            if not raw_geom: continue

            try:
                geom = None
                # Accept WKT (str) or WKB (bytes) encodings.
                if isinstance(raw_geom, str):
                    geom = wkt.loads(raw_geom)
                elif isinstance(raw_geom, bytes):
                    geom = wkb.loads(raw_geom)

                if not geom: continue

                # Repair invalid geometry (classic buffer(0) trick).
                if not geom.is_valid:
                    geom = geom.buffer(0)

                # Force Multi-geometry so the whole column is uniform.
                gtype = geom.geom_type.upper()
                if gtype == "POLYGON": geom = MultiPolygon([geom])
                elif gtype == "LINESTRING": geom = MultiLineString([geom])

                geom_types.add(geom.geom_type)

                # Convert to EWKT (SRID 4326).
                ewkt = f"SRID=4326;{geom.wkt}"

            except Exception:
                continue  # Skip rows with broken geometry

            # --- Handle attributes (FORCE STRING for safe TEXT columns) ---
            row_data = []
            for col in attr_columns:
                val = getattr(row, col, None)
                if val is not None:
                    row_data.append(str(val))
                else:
                    row_data.append(None)

            row_data.append(ewkt)
            clean_rows.append(tuple(row_data))

        if not clean_rows:
            raise ValueError("Data valid kosong setelah pemrosesan geometry")

        # 3. DATABASE OPERATIONS
        # Pick one observed geometry type; ensure it is the Multi* variant.
        final_geom_type = list(geom_types)[0].upper() if geom_types else "GEOM"
        if "MULTI" not in final_geom_type and final_geom_type != "GEOM":
            final_geom_type = "MULTI" + final_geom_type

        # A. CREATE TABLE
        col_defs = [f'"{col}" TEXT' for col in attr_columns]  # All attributes as TEXT first, for safety

        create_sql = f"""
            CREATE TABLE {table_name} (
                _id SERIAL PRIMARY KEY,
                {', '.join(col_defs)},
                geom TEXT
            );
        """

        async with db_pool.acquire() as conn:
            # Create table
            await conn.execute(create_sql)

            # B. COPY data (bulk insert)
            target_cols = attr_columns + ['geom']
            # asyncpg quotes column names automatically.
            await conn.copy_records_to_table(
                table_name,
                records=clean_rows,
                columns=target_cols
            )

            # C. ALTER the geom column to a typed geometry and add a GIST index.
            alter_sql = f"""
                ALTER TABLE {table_name}
                ALTER COLUMN geom TYPE geometry({final_geom_type}, 4326)
                USING ST_Force2D(geom::geometry)::geometry({final_geom_type}, 4326);

                CREATE INDEX idx_{table_name}_geom ON {table_name} USING GIST (geom);
            """
            await conn.execute(alter_sql)

        print(f"[SUCCESS] Upload {len(clean_rows)} baris ke tabel {table_name}.")

        # Remove the temp file after success (best-effort).
        try:
            os.remove(file_path)
        except OSError:
            pass

        return {
            "table_name": table_name,
            "row_count": len(clean_rows),
            "geom_type": final_geom_type
        }

    except Exception as e:
        print(f"[ERROR] Processing parquet to DB: {e}")
        raise e
|
||||
|
||||
|
||||
async def save_author_metadata(payload_author: dict, table_name: str, dataset_title: str,
                               geom_types: list, row_count: int, user_id: int):
    """
    Save author metadata and dataset information into backend.author_metadata.

    Args:
        payload_author: raw author form fields from the frontend (keys like
            abstract, keywords, topicCategory, dateCreated, status,
            organization, contactName, contactEmail, contactPhone).
        table_name: physical table name the dataset was loaded into.
        dataset_title: human-readable dataset title.
        geom_types: geometry type names found in the dataset (stored as JSON).
        row_count: number of geometry rows inserted.
        user_id: id of the uploading user.
    """
    query = text("""
        INSERT INTO backend.author_metadata (
            table_title,
            dataset_title,
            dataset_abstract,
            keywords,
            topic_category,
            date_created,
            dataset_status,
            organization_name,
            contact_person_name,
            contact_email,
            contact_phone,
            geom_type,
            user_id,
            process,
            geometry_count
        ) VALUES (
            :table_title,
            :dataset_title,
            :dataset_abstract,
            :keywords,
            :topic_category,
            :date_created,
            :dataset_status,
            :organization_name,
            :contact_person_name,
            :contact_email,
            :contact_phone,
            :geom_type,
            :user_id,
            :process,
            :geometry_count
        )
    """)

    # Map frontend payload keys to DB columns; missing keys become NULL.
    params = {
        "table_title": table_name,
        "dataset_title": dataset_title,
        "dataset_abstract": payload_author.get("abstract"),
        "keywords": payload_author.get("keywords"),
        # topicCategory arrives as a list; stored as a comma-separated string.
        "topic_category": ", ".join(payload_author.get("topicCategory", [])),
        # str_to_date returns None for empty or malformed date strings.
        "date_created": str_to_date(payload_author.get("dateCreated")),
        "dataset_status": payload_author.get("status"),
        "organization_name": payload_author.get("organization"),
        "contact_person_name": payload_author.get("contactName"),
        "contact_email": payload_author.get("contactEmail"),
        "contact_phone": payload_author.get("contactPhone"),
        # Stored as a JSON-encoded list of geometry type names.
        "geom_type": json.dumps(geom_types),
        "user_id": user_id,
        "process": 'CLEANSING',
        "geometry_count": row_count
    }

    # engine.begin() opens a transaction and commits it on success.
    async with engine.begin() as conn:
        await conn.execute(query, params)
|
||||
|
||||
|
||||
async def call_cleansing_procedure(table_name: str):
    """
    Run the geometry-cleansing stored procedure for *table_name*.

    Returns:
        The literal string "done" on success.

    Raises:
        RuntimeError: when the stored procedure fails at the database level,
            so the calling service knows the cleansing step did not complete.
    """
    print(f"[INFO] Memulai cleansing database untuk tabel: {table_name}")

    # Safe parameter binding — the table name is never interpolated into SQL.
    stmt = text("CALL pr_cleansing_satupeta_polygon(:table_name, NULL);")

    try:
        async with engine.begin() as conn:
            await conn.execute(stmt, {"table_name": table_name})
    except SQLAlchemyError as e:
        print(f"[ERROR] Cleansing database gagal: {e}")
        # Surface the failure explicitly so the service layer can react.
        raise RuntimeError(f"Database cleansing failed: {str(e)}")

    print(f"[SUCCESS] Cleansing selesai untuk tabel: {table_name}")
    return "done"
|
||||
|
||||
|
||||
286
app/mapset_pipeline/service.py
Executable file
286
app/mapset_pipeline/service.py
Executable file
|
|
@ -0,0 +1,286 @@
|
|||
import os
|
||||
import shutil
|
||||
import pandas as pd
|
||||
from fastapi import UploadFile, HTTPException
|
||||
from typing import Optional
|
||||
|
||||
# --- Internal Modules ---
|
||||
from .api.schemas import UploadRequest, PdfRequest
|
||||
from .core.processing.analyzer import analyze_and_clean_dataframe, publish_mapset
|
||||
from .core.readers import (
|
||||
read_csv,
|
||||
read_shp,
|
||||
read_gdb,
|
||||
read_mpk,
|
||||
read_pdf,
|
||||
convert_df
|
||||
)
|
||||
from .data.repository import (
|
||||
generate_unique_table_name,
|
||||
insert_parquet_to_postgis,
|
||||
save_author_metadata,
|
||||
call_cleansing_procedure
|
||||
)
|
||||
|
||||
from app.mapset_pipeline.utils.file_ops import (
|
||||
detect_zip_type,
|
||||
generate_job_id,
|
||||
)
|
||||
from app.mapset_pipeline.utils.formatters import (
|
||||
save_xml_to_sld,
|
||||
)
|
||||
|
||||
# --- Legacy/External Modules (Sesuai kode asli Anda) ---
|
||||
from app.core.config import UPLOAD_FOLDER, MAX_FILE_MB, GEONETWORK_URL
|
||||
from utils.logger_config import log_activity
|
||||
|
||||
# from api.routers.datasets_router import (
|
||||
# upload_to_main
|
||||
# )
|
||||
|
||||
async def handle_file_analysis(
    file: UploadFile,
    page: Optional[str] = "",
    sheet: Optional[str] = "",
    fileDesc: Optional[str] = ""
):
    """
    Orchestrator for the /upload endpoint.

    1. Persist the uploaded file to disk.
    2. Pick a reader based on the file extension.
    3. Run the analysis/cleaning processor.
    4. Remove the physical file.

    Args:
        file: the uploaded file (read fully into memory for the size check).
        page: page selection, only meaningful for PDF input.
        sheet: sheet name, only meaningful for XLSX input.
        fileDesc: free-text description forwarded to the analyzer.

    Raises:
        HTTPException: 413 when too large, 400 for unsupported input,
            422 when no valid table is found, 500 on unexpected errors.
    """
    fname = file.filename
    ext = os.path.splitext(fname)[1].lower()

    # 1. Validate size & persist.
    # NOTE: the whole file is held in memory here, so MAX_FILE_MB also bounds
    # the memory used by this request.
    contents = await file.read()
    size_mb = len(contents) / (1024 * 1024)
    if size_mb > MAX_FILE_MB:
        raise HTTPException(status_code=413, detail="Ukuran File Terlalu Besar")

    tmp_path = UPLOAD_FOLDER / fname
    # Make sure the upload folder exists.
    os.makedirs(UPLOAD_FOLDER, exist_ok=True)

    with open(tmp_path, "wb") as f:
        f.write(contents)

    df = None
    try:
        # 2. Route to a reader based on the extension.
        print(f"[INFO] Processing file type: {ext}")

        if ext == ".csv":
            df = read_csv(str(tmp_path))
        elif ext == ".xlsx":
            df = read_csv(str(tmp_path), sheet)  # read_csv also handles xlsx in this codebase
        elif ext == ".mpk":
            df = read_mpk(str(tmp_path))
        elif ext == ".pdf":
            # PDF extraction may yield zero, one, or many tables.
            tbl = read_pdf(tmp_path, page)
            if len(tbl) == 0:
                return {
                    "message": "Tidak ditemukan tabel valid pada halaman yang dipilih",
                    "tables": {},
                    "file_type": ext
                }
            elif len(tbl) > 1:
                # Multiple tables: return them so the frontend can pick one.
                return {
                    "message": "File berhasil dibaca, ditemukan banyak tabel.",
                    "tables": tbl,
                    "file_type": ext
                }
            else:
                df = convert_df(tbl[0])
        elif ext == ".zip":
            zip_type = detect_zip_type(str(tmp_path))
            if zip_type == "shp":
                df = read_shp(str(tmp_path))
            elif zip_type == "gdb":
                df = read_gdb(str(tmp_path))
            else:
                raise HTTPException(status_code=400, detail="ZIP file tidak mengandung SHP / GDB valid.")
        else:
            raise HTTPException(status_code=400, detail="Unsupported file type")

        # Guard against an empty dataframe.
        if df is None or (hasattr(df, "empty") and df.empty):
            raise HTTPException(status_code=422, detail="File berhasil dibaca, tetapi tidak ditemukan tabel valid")

        # 3. Run the processor (cleaning & validation logic).
        result_analysis = await analyze_and_clean_dataframe(df, ext, fname, fileDesc)
        return result_analysis

    except HTTPException:
        # BUGFIX: deliberate HTTP errors raised above (400/422) were previously
        # swallowed by the generic handler below and re-emitted as 500s.
        raise

    except Exception as e:
        print(f"[ERROR] handle_file_analysis: {e}")
        raise HTTPException(status_code=500, detail=str(e))

    finally:
        # 4. Remove the raw upload; the processor's temp parquet file survives
        # until the frontend sends the ingest request.
        if tmp_path.exists():
            try:
                os.remove(tmp_path)
            except Exception:
                pass
|
||||
|
||||
|
||||
async def process_pdf_file(payload: PdfRequest):
    """
    Handle a PDF upload where the user selected a specific table.

    The request body (via ``model_dump``) is converted to a DataFrame by the
    same ``convert_df`` helper the PDF reader uses, then routed through the
    shared analysis processor.

    Raises:
        HTTPException: 422 when the payload contains no valid table,
            500 on unexpected errors.
    """
    try:
        df = convert_df(payload.model_dump())

        if df is None or (hasattr(df, "empty") and df.empty):
            raise HTTPException(status_code=422, detail="Tidak ada tabel valid dalam PDF")

        # Reuse the same processor as regular uploads.
        return await analyze_and_clean_dataframe(
            df, '.pdf', payload.fileName, payload.fileDesc
        )
    except HTTPException:
        # BUGFIX: the 422 above was previously caught by the generic handler
        # and re-raised as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
async def execute_postgis_ingestion(payload: UploadRequest, user_id: int):
    """
    Orchestrator for the /process-to-postgis endpoint.

    1. Receive data (JSON rows).
    2. Convert to a temporary Parquet file.
    3. Upload to PostGIS (via repository).
    4. Save metadata (via repository).
    5. Trigger cleansing & publishing.
    6. Activity logging.

    Raises:
        HTTPException: 500 wrapping any failure in the pipeline (the original
            error is logged first via log_activity).
    """
    job_id = generate_job_id(str(user_id))

    try:
        # 1. Generate a unique physical table name from the dataset title.
        table_name = await generate_unique_table_name(payload.title)

        # 2. Data preparation (JSON -> DataFrame -> Parquet).
        # A parquet file is required because insert_parquet_to_postgis reads
        # from disk; this also separates memory load between API and DB work.
        df = pd.DataFrame(payload.rows)

        # Upper-case all column names for consistency downstream.
        df.columns = [col.upper() for col in df.columns]

        # Standardise the geometry column name coming from the frontend.
        if "GEOMETRY" in df.columns:
            df.rename(columns={"GEOMETRY": "GEOM"}, inplace=True)

        # Persist to a temp file for the repository layer to consume.
        temp_parquet_name = f"{job_id}.parquet"
        temp_parquet_path = os.path.join("tmp", temp_parquet_name)
        os.makedirs("tmp", exist_ok=True)

        # Save parquet (pyarrow or fastparquet engine).
        df.to_parquet(temp_parquet_path, index=False)

        # 3. Insert into PostGIS: the repository reads the parquet file,
        # cleans the geometries and bulk-copies into the new table.
        db_result = await insert_parquet_to_postgis(temp_parquet_name, table_name)

        # 4. Save metadata — geometry type and row count are taken from the
        # DB result, which is more accurate than the raw input.
        final_geom_types = [db_result['geom_type']]  # simplified to a list
        row_count = db_result['row_count']

        await save_author_metadata(
            payload_author=payload.author,
            table_name=table_name,
            dataset_title=payload.title,
            geom_types=final_geom_types,
            row_count=row_count,
            user_id=user_id
        )

        # 5. Activity logging.
        await log_activity(
            user_id=user_id,
            action_type="UPLOAD",
            action_title=f"Upload dataset {table_name}",
            details={"table_name": table_name, "rows": row_count}
        )

        # 6. Post-processing (external APIs). job_status/metadata_uuid are
        # filled in below as those steps complete.
        result = {
            "job_id": job_id,
            "job_status": "wait",
            "table_name": table_name,
            "status": "success",
            "message": f"Tabel '{table_name}' berhasil dibuat.",
            "total_rows": row_count,
            "geometry_type": final_geom_types,
            "crs": payload.author.get("crs", "EPSG:4326"),
            "metadata_uuid": ""
        }

        # Save the layer style (SLD) for later publishing.
        save_xml_to_sld(payload.style, job_id)

        # Cleansing via stored procedure; a failure here is non-fatal and is
        # reported through job_status instead of aborting the upload.
        try:
            cleansing_status = await call_cleansing_procedure(table_name)
        except Exception as e:
            cleansing_status = "failed"
            print(f"Cleansing warning: {e}")
        result['job_status'] = cleansing_status

        # Publish layer (GeoServer/GeoNetwork).
        publish_info = await publish_mapset(table_name, job_id)
        result['metadata_uuid'] = publish_info.get('uuid', '')

        # 7. Upload to the main portal (mapset integration).
        mapset_payload = {
            "name": payload.title,
            "description": payload.author.get("abstract"),
            "scale": "1:25000",
            # IDs hard-coded as in the original code (consider moving to config/env).
            'projection_system_id': '0196c746-d1ba-7f1c-9706-5df738679cc7',
            "category_id": payload.author.get("mapsetCategory"),
            "data_status": "sementara",
            'classification_id': '01968b4b-d3f9-76c9-888c-ee887ac31ce4',
            'producer_id': '01968b54-0000-7a67-bd10-975b8923b93e',
            "layer_type": final_geom_types[0],
            'source_id': ['019c03ef-35e1-738b-858d-871dc7d1e4d6'],
            "layer_url": publish_info.get('geos_link', ''),
            "metadata_url": f"{GEONETWORK_URL}/srv/eng/catalog.search#/metadata/{publish_info.get('uuid', '')}",
            "coverage_level": "provinsi",
            "coverage_area": "kabupaten",
            "data_update_period": "Tahunan",
            "data_version": "2026",
            "is_popular": False,
            "is_active": True,
            'regional_id': '01968b53-a910-7a67-bd10-975b8923b92e',
            "notes": "Mapset baru dibuat",
            "status_validation": "on_verification",
        }

        # await upload_to_main(mapset_payload)

        return result

    except Exception as e:
        # Error handling & logging.
        await log_activity(
            user_id=user_id,
            action_type="ERROR",
            action_title="Upload gagal",
            details={"error": str(e)}
        )
        print(f"[ERROR] execute_postgis_ingestion: {e}")
        # Re-raise as an HTTPException so the router returns a clean 500.
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
105
app/mapset_pipeline/utils/file_ops.py
Executable file
105
app/mapset_pipeline/utils/file_ops.py
Executable file
|
|
@ -0,0 +1,105 @@
|
|||
import os
|
||||
import uuid
|
||||
import zipfile
|
||||
import geopandas as gpd
|
||||
from shapely import wkt
|
||||
from shapely.errors import ShapelyError
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def detect_zip_type(zip_path: str) -> str:
    """
    Classify a zip archive by its spatial content.

    Returns:
        "gdb" when the archive contains a File Geodatabase (a ``.gdb/``
        directory or gdb-specific file extensions), "shp" when it contains a
        shapefile, "unknown" otherwise.
    """
    with zipfile.ZipFile(zip_path, "r") as archive:
        names = [entry.lower() for entry in archive.namelist()]

    # A ".gdb/" path component anywhere marks a File Geodatabase folder.
    if any(".gdb/" in name for name in names):
        return "gdb"

    # Loose geodatabase files without the .gdb folder structure.
    gdb_suffixes = (".gdbtable", ".gdbtablx", ".gdbindexes", ".spx")
    if any(name.endswith(gdb_suffixes) for name in names):
        return "gdb"

    if any(name.endswith(".shp") for name in names):
        return "shp"

    return "unknown"
|
||||
|
||||
|
||||
def generate_unique_filename(folder="tmp", ext="parquet", digits=6):
    """
    Return a path "<folder>/<random-id>.<ext>" that does not exist yet.

    The folder is created if missing. A random UUID integer makes collisions
    practically impossible, but the existence check keeps the guarantee.

    Note: ``digits`` is kept for backward compatibility but is unused — the
    id length is determined by ``uuid.uuid4().int``.
    """
    os.makedirs(folder, exist_ok=True)
    while True:
        # BUGFIX: the original had a duplicated assignment
        # (``file_id = file_id = uuid.uuid4().int``).
        file_id = uuid.uuid4().int
        candidate = f"{folder}/{file_id}.{ext}"

        if not os.path.exists(candidate):
            return candidate
|
||||
|
||||
|
||||
def generate_job_id(user_id: str) -> str:
    """Compose a job id "<user_id>_<YYYYMMDDHHMMSS>" from the current local time."""
    stamp = datetime.now().strftime("%Y%m%d%H%M%S")
    return "_".join((user_id, stamp))
|
||||
|
||||
|
||||
def dataframe_validation(df_input, tmp_file):
    """
    Validate, clean, and export a tabular dataset with WKT geometry to parquet.

    Runs in a separate thread (CPU bound). Expects *df_input* to carry a
    ``geometry`` column of WKT strings; writes the cleaned GeoDataFrame to
    *tmp_file* and returns the number of rows exported.

    Raises:
        ValueError: when no row yields a valid geometry.
    """
    # 1. Work on a copy so the caller's dataframe is untouched.
    export_df = df_input.copy()

    # =========================================================================
    # STAGE 1: SAFE WKT LOADING
    # =========================================================================
    def safe_load_wkt(raw):
        """Parse a WKT string; return None for non-strings or unparseable input."""
        if not isinstance(raw, str):
            return None
        try:
            return wkt.loads(raw)
        # Broad catch is deliberate: any parse failure just voids the row.
        # (The original ``except (ShapelyError, Exception)`` was redundant —
        # Exception already covers ShapelyError.)
        except Exception:
            return None

    export_df["geom"] = export_df["geometry"].apply(safe_load_wkt)

    # =========================================================================
    # STAGE 2: FILTER NULL & INVALID GEOMETRY
    # =========================================================================
    # Drop rows where WKT parsing failed (None).
    export_df = export_df[export_df["geom"].notnull()]
    if export_df.empty:
        raise ValueError("Tidak ada data spasial valid yang ditemukan.")

    # Promote to a GeoDataFrame.
    export_df = gpd.GeoDataFrame(export_df, geometry="geom")

    # =========================================================================
    # STAGE 3: FIX TOPOLOGY
    # =========================================================================
    # buffer(0) is the standard GIS trick to repair light topology errors
    # (e.g. self-intersecting polygon rings).
    export_df["geom"] = export_df["geom"].apply(
        lambda g: g.buffer(0) if not g.is_valid else g
    )

    # Drop geometries that became empty after the fix (rare, but safe).
    export_df = export_df[~export_df["geom"].is_empty]

    # =========================================================================
    # STAGE 4: FINALISE (CRS & RENAME)
    # =========================================================================
    export_df = export_df.drop(columns=["geometry"])  # drop the old WKT column
    export_df = export_df.set_crs("EPSG:4326", allow_override=True)

    # Upper-case attribute columns (and strip ghost spaces: " ID " -> "ID"),
    # keeping 'geom' lowercase.
    export_df = export_df.rename(
        columns=lambda c: str(c).strip().upper() if c != "geom" else c
    )

    # Export to parquet for the ingestion stage.
    export_df.to_parquet(tmp_file)

    return len(export_df)
|
||||
|
||||
43
app/mapset_pipeline/utils/formatters.py
Executable file
43
app/mapset_pipeline/utils/formatters.py
Executable file
|
|
@ -0,0 +1,43 @@
|
|||
import os
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from shapely.geometry import base as shapely_base
|
||||
from shapely.geometry.base import BaseGeometry
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def safe_json(value):
    """
    Convert numpy/pandas/shapely values into JSON-serializable Python types.

    Returns int/float for numpy scalars, ISO strings for Timestamps, WKT
    strings for shapely geometries, None for scalar NA values, and the value
    itself otherwise (lists/dicts pass through unchanged).
    """
    if isinstance(value, (np.int64, np.int32)):
        return int(value)
    if isinstance(value, (np.float64, np.float32)):
        return float(value)
    if isinstance(value, pd.Timestamp):
        return value.isoformat()
    if isinstance(value, shapely_base.BaseGeometry):
        return str(value)  # WKT representation
    # BUGFIX: pd.isna() on list-like input returns an array, which raises
    # ValueError in a boolean context — only NA-check scalars.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    return value
|
||||
|
||||
|
||||
def str_to_date(raw_date: str):
    """Parse a 'YYYY-MM-DD' string into a datetime.date; None on empty/invalid input."""
    if not raw_date:
        return None
    try:
        return datetime.strptime(raw_date, "%Y-%m-%d").date()
    except Exception as e:
        print("[WARNING] Tidak bisa parse dateCreated:", e)
        return None
|
||||
|
||||
|
||||
def save_xml_to_sld(xml_string, filename):
    """
    Write an SLD style document to style_temp/<filename>.sld.

    Args:
        xml_string: the SLD/XML content to persist.
        filename: base name (typically the job id) for the .sld file.

    Returns:
        The path of the written file.
    """
    folder_path = 'style_temp'
    os.makedirs(folder_path, exist_ok=True)

    # BUGFIX: name the file after the caller-supplied identifier — the
    # parameter was previously ignored, so every call overwrote one file.
    file_path = os.path.join(folder_path, f"{filename}.sld")

    with open(file_path, "w", encoding="utf-8") as f:
        f.write(xml_string)

    return file_path
|
||||
|
||||
208
app/mapset_pipeline/utils/pdf_cleaner.py
Executable file
208
app/mapset_pipeline/utils/pdf_cleaner.py
Executable file
|
|
@ -0,0 +1,208 @@
|
|||
import re
|
||||
import itertools
|
||||
|
||||
# Lowercase keywords that mark a column as geographic/administrative:
# coordinate columns (lat/lon/geometry) plus Indonesian administrative
# levels (desa/kelurahan/kecamatan/kabupaten/kota/provinsi, ...).
geo_admin_keywords = [
    'lat', 'lon', 'long', 'latitude', 'longitude', 'koordinat', 'geometry', 'geometri',
    'desa', 'kelurahan', 'kel', 'kecamatan', 'kabupaten', 'kab', 'kota', 'provinsi',
    'lokasi', 'region', 'area', 'zone', 'boundary', 'batas'
]
|
||||
|
||||
def normalize_text(text):
    """Lowercase, replace non-alphanumerics (except '/') with spaces, collapse whitespace."""
    lowered = text.lower()
    cleaned = re.sub(r'[^a-z0-9/ ]+', ' ', lowered)
    return re.sub(r'\s+', ' ', cleaned).strip()
|
||||
|
||||
def generate_combined_patterns(keywords):
    """Build regex patterns matching 'a / b' and 'b / a' for every keyword pair."""
    patterns = []
    for first, second in itertools.combinations(keywords, 2):
        patterns.extend((rf'{first}\s*/\s*{second}', rf'{second}\s*/\s*{first}'))
    return patterns
|
||||
|
||||
# Built once at import time: 'a / b' pattern strings for every pair of keywords.
combined_patterns = generate_combined_patterns(geo_admin_keywords)
|
||||
|
||||
def contains_geo_admin_keywords(text):
    """
    True when *text* looks like a geographic/administrative column name.

    Matches either a slash-combined pair ("desa/kelurahan") or a single
    keyword bounded by start/end/space/slash/underscore/dash.
    """
    cleaned = normalize_text(text)
    if len(cleaned) < 3:
        return False

    if any(re.search(pattern, cleaned) for pattern in combined_patterns):
        return True

    return any(
        re.search(rf'(^|[\s/_-]){kw}([\s/_-]|$)', cleaned)
        for kw in geo_admin_keywords
    )
|
||||
|
||||
def filter_geo_admin_column(tables):
    """Keep only the tables that have at least one geo/admin column."""
    return [
        table for table in tables
        if any(contains_geo_admin_keywords(col) for col in table['columns'])
    ]
|
||||
|
||||
|
||||
# Labels (substring match, case-sensitive variants included) that identify a
# row-number / serial-number column header.
NUMBER_HEADER_KEYWORDS = [
    "no","no.","nomor","nomor urut","no urut","No","Nomor","No Urut","Index",
    "ID","Sr No","S/N","SN","Sl No"
]
|
||||
|
||||
def has_number_header(header):
    """True when the header already contains a row-number style label."""
    return any(keyword in header for keyword in NUMBER_HEADER_KEYWORDS)
|
||||
|
||||
def is_numbering_column(col_values):
    """Heuristic: more than 60% of the non-empty string cells are 1-3 digit numbers."""
    candidates = [v.strip() for v in col_values if isinstance(v, str) and v]
    if not candidates:
        return False
    hits = sum(1 for v in candidates if re.fullmatch(r"0*\d{1,3}", v))
    return hits / len(candidates) > 0.6
|
||||
|
||||
def is_numeric_value(v):
    """True for real numbers and for strings of 1-3 digits (optionally zero-padded)."""
    if isinstance(v, (int, float)):
        return True
    if isinstance(v, str):
        return bool(re.fullmatch(r"0*\d{1,3}", v.strip()))
    return False
|
||||
|
||||
def cleaning_column(headers, bodies):
    """
    Clean each extracted table body.

    Per table: drop a leading auto-numbering column that has no matching
    header, then drop rows whose cell count does not match the table's
    column count.

    Args:
        headers: per-table column-name collections (parallel to *bodies*).
        bodies:  per-table lists of rows (each row a list of cells).

    Returns:
        Cleaned bodies, one entry per input table.
    """
    cleaned_bodies = []

    for header, body in zip(headers, bodies):
        if not body:
            cleaned_bodies.append(body)
            continue

        header_has_number = has_number_header(header)
        first_col = [row[0] for row in body if row and len(row) > 0]
        first_col_is_numbering = is_numbering_column(first_col)

        # PDF extractors often emit a row-number column without a header:
        # strip the numeric first cell from each row in that case.
        if not header_has_number and first_col_is_numbering:
            new_body = []
            for row in body:
                if not row:
                    continue
                first_val = row[0]
                if is_numeric_value(first_val) and len(row) > 1:
                    new_body.append(row[1:])
                else:
                    new_body.append(row)
            body = new_body

        # BUGFIX: compare against this table's column count (len(header)),
        # not the number of tables (len(headers)).
        header_len = len(header)
        filtered_body = [row for row in body if len(row) == header_len]

        cleaned_bodies.append(filtered_body)

    return cleaned_bodies
|
||||
|
||||
def parse_page_selection(selectedPage: str, total_pages: int):
    """
    Expand a page spec like "1-3, 5 7" into a sorted list of valid page numbers.

    An empty spec selects every page; malformed tokens are ignored; pages
    outside 1..total_pages are dropped.
    """
    if not selectedPage:
        return list(range(1, total_pages + 1))

    pages = set()
    for token in re.split(r'[,\s]+', selectedPage.strip()):
        try:
            if '-' in token:
                start, end = map(int, token.split('-'))
                pages.update(range(start, end + 1))
            else:
                pages.add(int(token))
        except ValueError:
            continue  # skip malformed tokens

    return [p for p in sorted(pages) if 1 <= p <= total_pages]
|
||||
|
||||
def is_number(s):
    """True when *s* (after dropping ',' and '.' separators) is all digits."""
    if s is None:
        return False
    digits_only = str(s).strip().translate(str.maketrans('', '', ',.'))
    return digits_only.isdigit()
|
||||
|
||||
def row_ratio(row):
    """Fraction of non-empty cells in *row* that are numeric (0 for an empty row)."""
    cells = [c for c in row if c not in (None, '', ' ')]
    if not cells:
        return 0
    return sum(is_number(c) for c in cells) / len(cells)
|
||||
|
||||
def has_mixed_text_and_numbers(row):
    """True when a row contains both alphabetic text and numeric cells."""
    cells = [c for c in row if c not in (None, '', ' ')]
    contains_text = any(
        isinstance(c, str) and re.search(r'[A-Za-z]', str(c)) for c in cells
    )
    contains_number = any(is_number(c) for c in cells)
    return contains_text and contains_number
|
||||
|
||||
def is_short_text_row(row):
    """Detect short text-only rows (<=2 cells, joined length <20) — likely captions."""
    cells = [str(c).strip() for c in row if c not in (None, '', ' ')]
    if not cells:
        return False
    if any(is_number(c) for c in cells):
        return False
    return len(cells) <= 2 and len(" ".join(cells)) < 20
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def get_number_column_index(columns):
    """Index of the first column whose header looks like a row-number label, else None."""
    for idx, col in enumerate(columns):
        if has_number_header(col):
            return idx
    return None
|
||||
|
||||
def get_start_end_number(rows, idx):
    """
    Return (first, last) values of column *idx* as ints, or (None, None).

    Falls back to (None, None) when the table is empty, the index is out of
    range, or the cells are not numeric. (Narrowed from a bare ``except:``,
    which also swallowed KeyboardInterrupt/SystemExit.)
    """
    try:
        return int(rows[0][idx]), int(rows[-1][idx])
    except (IndexError, KeyError, TypeError, ValueError):
        return None, None
|
||||
|
||||
def normalize_number_column(table):
    """
    Repair the row-number column in place so it is strictly increasing.

    PDF-extracted tables often repeat or reset their numbering; whenever a
    value does not increase, it is replaced with previous + 1. Rows whose
    number cell cannot be parsed are left untouched.

    Args:
        table: dict with "columns" (header names) and "rows" (mutable rows).

    Returns:
        The same table dict, mutated.
    """
    columns = table["columns"]
    rows = table["rows"]

    num_idx = get_number_column_index(columns)
    if num_idx is None:
        return table  # no number column to normalize

    current = None

    for row in rows:
        # Narrowed from a bare ``except:`` — only parse/access failures skip.
        try:
            val = int(row[num_idx])
        except (IndexError, TypeError, ValueError):
            continue

        if current is None:
            current = val
        elif val <= current:
            # Numbering stalled or reset: continue the sequence.
            current += 1
        else:
            current = val

        row[num_idx] = str(current)

    return table
|
||||
0
app/models/__init__.py
Normal file → Executable file
0
app/models/__init__.py
Normal file → Executable file
0
app/models/base.py
Normal file → Executable file
0
app/models/base.py
Normal file → Executable file
0
app/models/category_model.py
Normal file → Executable file
0
app/models/category_model.py
Normal file → Executable file
0
app/models/classification_model.py
Normal file → Executable file
0
app/models/classification_model.py
Normal file → Executable file
0
app/models/credential_model.py
Normal file → Executable file
0
app/models/credential_model.py
Normal file → Executable file
0
app/models/feedback_model.py
Normal file → Executable file
0
app/models/feedback_model.py
Normal file → Executable file
0
app/models/file_model.py
Normal file → Executable file
0
app/models/file_model.py
Normal file → Executable file
0
app/models/map_access_model.py
Normal file → Executable file
0
app/models/map_access_model.py
Normal file → Executable file
0
app/models/map_projection_system_model.py
Normal file → Executable file
0
app/models/map_projection_system_model.py
Normal file → Executable file
0
app/models/map_source_model.py
Normal file → Executable file
0
app/models/map_source_model.py
Normal file → Executable file
0
app/models/mapset_history_model.py
Normal file → Executable file
0
app/models/mapset_history_model.py
Normal file → Executable file
0
app/models/mapset_model.py
Normal file → Executable file
0
app/models/mapset_model.py
Normal file → Executable file
0
app/models/news_model.py
Normal file → Executable file
0
app/models/news_model.py
Normal file → Executable file
0
app/models/organization_model.py
Normal file → Executable file
0
app/models/organization_model.py
Normal file → Executable file
0
app/models/refresh_token_model.py
Normal file → Executable file
0
app/models/refresh_token_model.py
Normal file → Executable file
0
app/models/regional_model.py
Normal file → Executable file
0
app/models/regional_model.py
Normal file → Executable file
0
app/models/role_model.py
Normal file → Executable file
0
app/models/role_model.py
Normal file → Executable file
0
app/models/user_model.py
Normal file → Executable file
0
app/models/user_model.py
Normal file → Executable file
0
app/repositories/__init__.py
Normal file → Executable file
0
app/repositories/__init__.py
Normal file → Executable file
0
app/repositories/base.py
Normal file → Executable file
0
app/repositories/base.py
Normal file → Executable file
0
app/repositories/category_repository.py
Normal file → Executable file
0
app/repositories/category_repository.py
Normal file → Executable file
0
app/repositories/classification_repository.py
Normal file → Executable file
0
app/repositories/classification_repository.py
Normal file → Executable file
0
app/repositories/credential_repository.py
Normal file → Executable file
0
app/repositories/credential_repository.py
Normal file → Executable file
0
app/repositories/feedback_repository.py
Normal file → Executable file
0
app/repositories/feedback_repository.py
Normal file → Executable file
0
app/repositories/file_repository.py
Normal file → Executable file
0
app/repositories/file_repository.py
Normal file → Executable file
0
app/repositories/map_access_repository.py
Normal file → Executable file
0
app/repositories/map_access_repository.py
Normal file → Executable file
0
app/repositories/map_projection_system_repository.py
Normal file → Executable file
0
app/repositories/map_projection_system_repository.py
Normal file → Executable file
0
app/repositories/map_source_repository.py
Normal file → Executable file
0
app/repositories/map_source_repository.py
Normal file → Executable file
0
app/repositories/map_source_usage_repository.py
Normal file → Executable file
0
app/repositories/map_source_usage_repository.py
Normal file → Executable file
0
app/repositories/mapset_history_repository.py
Normal file → Executable file
0
app/repositories/mapset_history_repository.py
Normal file → Executable file
0
app/repositories/mapset_repository.py
Normal file → Executable file
0
app/repositories/mapset_repository.py
Normal file → Executable file
0
app/repositories/news_repository.py
Normal file → Executable file
0
app/repositories/news_repository.py
Normal file → Executable file
0
app/repositories/organization_repository.py
Normal file → Executable file
0
app/repositories/organization_repository.py
Normal file → Executable file
0
app/repositories/regional_repository.py
Normal file → Executable file
0
app/repositories/regional_repository.py
Normal file → Executable file
0
app/repositories/role_repository.py
Normal file → Executable file
0
app/repositories/role_repository.py
Normal file → Executable file
0
app/repositories/token_repository.py
Normal file → Executable file
0
app/repositories/token_repository.py
Normal file → Executable file
0
app/repositories/user_repository.py
Normal file → Executable file
0
app/repositories/user_repository.py
Normal file → Executable file
0
app/response/res.py
Normal file → Executable file
0
app/response/res.py
Normal file → Executable file
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user