This commit is contained in:
DmsAnhr 2026-02-23 12:20:42 +07:00
parent 654ad382fe
commit 6c0d8729f7
160 changed files with 3403 additions and 0 deletions

0
.DS_Store vendored Normal file → Executable file
View File

0
.env.example Normal file → Executable file
View File

3
.gitignore vendored Normal file → Executable file
View File

@@ -172,3 +172,6 @@ cython_debug/
# PyPI configuration file
.pypirc
tests/

0
.pre-commit-config.yaml Normal file → Executable file
View File

0
.python-version Normal file → Executable file
View File

0
Dockerfile Normal file → Executable file
View File

0
README.md Normal file → Executable file
View File

11
addons.txt Executable file
View File

@@ -0,0 +1,11 @@
pandas = "^3.0.0"
geopandas = "^1.1.2"
fiona = "^1.10.1"
numpy = "^2.4.2"
pdfplumber = "^0.11.9"
py7zr = "^1.1.0"
pyogrio = "^0.12.1"
rapidfuzz = "^3.14.3"
requests = "^2.32.5"
openpyxl = "^3.1.5"
pyarrow = "21.0.0"

0
alembic.ini Normal file → Executable file
View File

0
app/__init__.py Normal file → Executable file
View File

0
app/api/dependencies/__init__.py Normal file → Executable file
View File

0
app/api/dependencies/auth.py Normal file → Executable file
View File

0
app/api/dependencies/database.py Normal file → Executable file
View File

0
app/api/dependencies/factory.py Normal file → Executable file
View File

0
app/api/v1/__init__.py Normal file → Executable file
View File

0
app/api/v1/routes/__init__.py Normal file → Executable file
View File

0
app/api/v1/routes/auth_route.py Normal file → Executable file
View File

0
app/api/v1/routes/category_route.py Normal file → Executable file
View File

0
app/api/v1/routes/classification_route.py Normal file → Executable file
View File

0
app/api/v1/routes/count_route.py Normal file → Executable file
View File

0
app/api/v1/routes/credential_route.py Normal file → Executable file
View File

0
app/api/v1/routes/feedback_route.py Normal file → Executable file
View File

0
app/api/v1/routes/file_route.py Normal file → Executable file
View File

0
app/api/v1/routes/geonetwork_route.py Normal file → Executable file
View File

0
app/api/v1/routes/map_projection_system_route.py Normal file → Executable file
View File

0
app/api/v1/routes/map_source_route.py Normal file → Executable file
View File

0
app/api/v1/routes/mapset_history_route.py Normal file → Executable file
View File

0
app/api/v1/routes/mapset_route.py Normal file → Executable file
View File

0
app/api/v1/routes/news_route.py Normal file → Executable file
View File

0
app/api/v1/routes/organization_route.py Normal file → Executable file
View File

0
app/api/v1/routes/regional_route.py Normal file → Executable file
View File

0
app/api/v1/routes/role_route.py Normal file → Executable file
View File

0
app/api/v1/routes/user_route.py Normal file → Executable file
View File

0
app/core/__init__.py Normal file → Executable file
View File

0
app/core/config.py Normal file → Executable file
View File

0
app/core/data_types.py Normal file → Executable file
View File

0
app/core/database.py Normal file → Executable file
View File

0
app/core/exceptions.py Normal file → Executable file
View File

0
app/core/minio_client.py Normal file → Executable file
View File

0
app/core/params.py Normal file → Executable file
View File

0
app/core/responses.py Normal file → Executable file
View File

0
app/core/security.py Normal file → Executable file
View File

0
app/main.py Normal file → Executable file
View File

View File

View File

@@ -0,0 +1,41 @@
# services/file_pipeline/router.py
from fastapi import APIRouter, Depends, File, UploadFile, Form
from .schemas import UploadRequest, PdfRequest
from app.mapset_pipeline.service import handle_file_analysis, process_pdf_file, execute_postgis_ingestion
from app.response.res import successRes, errorRes
# Router for the file-ingestion pipeline; all endpoints below are mounted under /pipeline.
router = APIRouter(prefix="/pipeline", tags=["File Pipeline"])
@router.post("/analyze")
async def upload_file(
    file: UploadFile = File(...),
    page: str = Form(""),
    sheet: str = Form(""),
    fileDesc: str = Form("")
):
    """Analyze an uploaded file and return its structured preview.

    `page` and `sheet` narrow the extraction for PDF/spreadsheet inputs;
    `fileDesc` is a free-text description forwarded to the analyzer.
    Any failure is reported as a 500 error payload rather than raised.
    """
    try:
        analysis = await handle_file_analysis(file, page, sheet, fileDesc)
        return successRes(data=analysis)
    except Exception as exc:
        return errorRes(message="Upload failed", details=str(exc), status_code=500)
@router.post("/analyze/pdf")
async def upload_pdf_file(
    payload: PdfRequest
):
    """Analyze tabular data extracted from a PDF.

    Fix: this handler was previously also named ``upload_file``, shadowing
    the "/analyze" handler defined above at module level and producing
    duplicate FastAPI operation ids. Renamed; the route path is unchanged,
    so API clients are unaffected.
    """
    try:
        res = await process_pdf_file(payload)
        return res
    except Exception as e:
        return errorRes(message="Upload failed", details=str(e), status_code=500)
@router.post("/publish")
async def process_to_postgis(payload: UploadRequest):
    """Ingest the analyzed dataset into PostGIS.

    TODO(review): user_id is hard-coded; it should come from the auth
    dependency instead.
    """
    try:
        outcome = await execute_postgis_ingestion(payload, user_id=2)
        return successRes(data=outcome)
    except Exception as exc:
        return errorRes(message="Processing failed", details=str(exc), status_code=500)

View File

@@ -0,0 +1,17 @@
from pydantic import BaseModel
from typing import List, Dict, Any
class PdfRequest(BaseModel):
    """Payload for /pipeline/analyze/pdf: a table extracted from a PDF."""
    # Table title as detected or entered by the client.
    title: str
    # Column headers of the extracted table.
    columns: List[str]
    # Row values; inner list length presumably matches `columns` — TODO confirm.
    rows: List[List[Any]]
    # Original file name (used for metadata generation downstream).
    fileName: str
    # Free-text description supplied by the uploader.
    fileDesc: str
class UploadRequest(BaseModel):
    """Payload for /pipeline/publish: data plus authoring info to ingest into PostGIS."""
    title: str
    # Presumably the temporary parquet path returned by the analyze step — TODO confirm.
    path: str
    rows: List[Dict[str, Any]]
    columns: List[str]
    # Author/organization details; free-form mapping.
    author: Dict[str, Any]
    style: str

View File

@@ -0,0 +1,49 @@
import requests
from typing import Dict, Any
from app.core.config import GEN_AI_URL
# Base endpoint of the metadata-generation (GenAI) service, from app config.
URL = GEN_AI_URL
def generate_metadata(payload: Dict[str, Any], timeout: float = 30.0) -> Dict[str, Any]:
    """Request AI-generated metadata for a dataset from the GenAI service.

    Posts `payload` as JSON and returns the decoded response. On any
    request failure (connection error, timeout, ...) a
    ``{"success": False, "error": ...}`` dict is returned instead of raising.

    Args:
        payload: JSON-serializable description of the dataset.
        timeout: Seconds to wait for the service. Fix: previously no
            timeout was set, so the call could block forever.

    Returns:
        The service's JSON body, or an error dict on request failure.
    """
    headers = {
        "Content-Type": "application/json",
        # SECURITY(review): hard-coded API key; move to configuration/env.
        "API_KEY": "testsatupeta"
    }
    try:
        response = requests.post(
            f"{URL}",
            json=payload,
            headers=headers,
            timeout=timeout,
        )
        # response.raise_for_status()
        # NOTE(review): a non-JSON body still raises ValueError here — confirm
        # whether the service can ever return non-JSON on success codes.
        return response.json()
    except requests.exceptions.RequestException as e:
        return {
            "success": False,
            "error": str(e)
        }
if __name__ == "__main__":
    # Example payload — manual smoke test against the live GenAI endpoint.
    payload = {
        "nama_file_peta": "peta bencana.pdf",
        "nama_opd": "Badan Penanggulangan Bencana Daerah (BPBD)",
        "tipe_data_spasial": "Multipolygon",
        "struktur_atribut_data": {},
        "metadata": {
            "judul": "",
            "abstrak": "",
            "tujuan": "",
            "keyword": [],
            "kategori": [],
            "kategori_mapset": ""
        }
    }
    result = generate_metadata(payload)
    print(result)

View File

@@ -0,0 +1,181 @@
import os
import asyncio
import pandas as pd
import geopandas as gpd
from app.response.res import errorRes
from app.mapset_pipeline.utils.file_ops import generate_unique_filename, dataframe_validation
from app.mapset_pipeline.utils.formatters import safe_json
from .geometry_build import is_geom_empty, detect_and_build_geometry, attach_polygon_geometry_auto
from app.mapset_pipeline.core.clients.ai_client import generate_metadata
from app.mapset_pipeline.core.publication.publish_geoserver import publish_layer_to_geoserver
from app.mapset_pipeline.core.publication.publish_geonetwork import publish_metadata
async def analyze_and_clean_dataframe(df: pd.DataFrame, ext: str, filename: str, fileDesc: str):
    """Analyze a raw DataFrame and prepare it for publication.

    Pipeline:
      1. Detect/build geometry (lat-lon columns, WKT, or admin-name join).
      2. Validate geometry and compute match statistics.
      3. Build JSON-safe preview and warning samples.
      4. Ask the GenAI service for metadata suggestions (best effort).
      5. Persist the cleaned frame to a temporary parquet file.

    Fixes vs. previous revision: guards the validity-percentage division
    against an empty frame (ZeroDivisionError), removes a dead no-op
    ``if/pass`` block, reuses the empty-geometry mask instead of scanning
    twice, and drops stray 'start'/'pass' debug prints.

    Returns:
        A response dict on success, or an errorRes(...) 422 payload when no
        geometry could be detected at all.
    """
    # 1. Geometry detection: direct columns first, then admin-name polygon join.
    result = detect_and_build_geometry(df, master_polygons=None)
    if not hasattr(result, "geometry") or result.geometry.isna().all():
        result = attach_polygon_geometry_auto(result)

    def normalize_geom_type(geom_type):
        # Collapse Multi* variants so the stats report the base type.
        if geom_type and geom_type.startswith("Multi"):
            return geom_type.replace("Multi", "")
        return geom_type

    # 2. Determine the dominant geometry type.
    if isinstance(result, gpd.GeoDataFrame) and "geometry" in result.columns:
        geom_types = (
            result.geometry
            .dropna()
            .geom_type
            .apply(normalize_geom_type)
            .unique()
        )
        geom_type = geom_types[0] if len(geom_types) > 0 else "None"
        null_geom = result.geometry.isna().sum()
        print(f"[INFO] Tipe Geometry: {geom_type}")
        print(f"[INFO] Jumlah geometry kosong: {null_geom}")
    else:
        # Fallback: no geometry column could be attached — report 422.
        res = {
            "message": "Tidak menemukan tabel yang relevan atau kolom geometri.",
            "file_type": ext,
            "rows": len(df),
            "columns": len(df.columns),
            "geometry_valid": 0,
            "geometry_empty": 0,
            "geometry_valid_percent": 0,
            "warnings": [],
            "warning_examples": [],
            "preview": []
        }
        return errorRes(message="Tidak berhasil mencocokan geometry pada tabel.", details=res, status_code=422)

    # 3. Replace values that are not JSON/parquet friendly.
    result = result.replace([pd.NA, float('inf'), float('-inf')], None)

    # Validity statistics. Guard: an empty frame previously divided by zero.
    total_rows = len(result)
    empty_mask = result['geometry'].apply(is_geom_empty)
    empty_count = int(empty_mask.sum())
    valid_count = total_rows - empty_count
    match_percentage = (valid_count / total_rows) * 100 if total_rows else 0.0

    warnings = []
    if empty_count > 0:
        warnings.append(
            f"{empty_count} dari {total_rows} baris tidak memiliki geometry yang valid "
            f"({100 - match_percentage:.2f}% data gagal cocok)."
        )
        # Sample of unmatched rows for the client (capped at 500).
        warning_examples = result[empty_mask].head(500).to_dict(orient="records")
    else:
        warning_examples = []

    # Preview copy with WKT strings so the payload is serializable; the
    # original geometry objects stay in `result` for later processing.
    data_df = result.copy()
    if 'geometry' in data_df.columns:
        data_df['geometry'] = data_df['geometry'].apply(
            lambda g: g.wkt if g is not None else None
        )
    preview_data = data_df.to_dict(orient="records")

    # Sanitize numpy scalar types into plain Python types.
    preview_safe = [
        {k: safe_json(v) for k, v in row.items()} for row in preview_data
    ]
    warning_safe = [
        {k: safe_json(v) for k, v in row.items()} for row in warning_examples
    ]

    # 4. AI metadata suggestion — best effort; failures degrade to {}.
    ai_context = {
        "nama_file_peta": filename,
        "nama_opd": "Badan Penanggulangan Bencana Daerah (BPBD) Provinsi Jatim",  # TODO: make dynamic
        "tipe_data_spasial": geom_type,
        "deskripsi_singkat": fileDesc,
        "struktur_atribut_data": {},
    }
    try:
        ai_suggest = generate_metadata(ai_context)
    except Exception as e:
        print(f"[WARNING] Gagal generate metadata AI: {e}")
        ai_suggest = {}

    # 5. Persist to a uniquely-named temporary parquet (thread-safe naming);
    # the synchronous conversion runs off the event loop.
    tmp_file = generate_unique_filename(folder="tmp", ext="parquet")
    await asyncio.to_thread(dataframe_validation, data_df, tmp_file)

    return {
        "message": "File berhasil dibaca dan dianalisis.",
        "file_name": filename,
        "file_type": ext,
        "rows": int(total_rows),
        "columns": list(map(str, result.columns)),
        "geometry_valid": int(valid_count),
        "geometry_empty": int(empty_count),
        "geometry_valid_percent": float(round(match_percentage, 2)),
        "geometry_type": geom_type,
        "warnings": warnings,
        "warning_rows": warning_safe,
        "preview": preview_safe,
        "metadata_suggest": ai_suggest,
        "tmp_path": tmp_file
    }
async def publish_mapset(table_name: str, job_id: str):
    """Publish a table as a GeoServer layer and register its GeoNetwork metadata.

    Raises RuntimeError (chained to the original error) on any failure.
    """
    try:
        geos_link = publish_layer_to_geoserver(table_name, job_id)
        uuid = await publish_metadata(
            table_name=table_name,
            geoserver_links=geos_link
        )
    except Exception as exc:
        # await update_job_status(table_name, "FAILED", job_id)
        raise RuntimeError(f"Publish layer gagal: {exc}") from exc
    # await update_job_status(table_name, "FINISHED", job_id)
    return {
        "geos_link": geos_link["layer_url"],
        # TODO(review): return the real GeoNetwork uuid instead of this stub.
        "uuid": "123123",
    }

View File

@@ -0,0 +1,466 @@
import geopandas as gpd
from shapely.geometry import Point, LineString
import pandas as pd
import numpy as np
import re
import os
from shapely import wkt
from rapidfuzz import process, fuzz
from sqlalchemy import create_engine
from shapely.geometry.base import BaseGeometry
from app.core.config import REFERENCE_DB_URL, REFERENCE_SCHEMA, DESA_REF, KEC_REF, KAB_REF
# ============================================================
# KONFIGURASI DAN KONSTANTA
# ============================================================
# Known header variants for each administrative level. Matching happens on a
# normalized form (lowercased, spaces and '/' replaced with '_') inside
# find_admin_column, so each alias is tested as a substring of that form.
# NOTE(review): 'Desa/Kel' contains uppercase and '/', which the normalized
# form never does — likely a dead entry; confirm and drop.
COLUMN_ALIASES = {
    'desa': ['desa', 'kelurahan', 'desa_kelurahan', 'desa/kelurahan', 'nama_desa', 'nama_kelurahan', 'Desa/Kel'],
    'kecamatan': ['kec', 'kecamatan', 'nama_kec', 'nama_kecamatan'],
    'kabupaten': ['kab', 'kabupaten', 'kota', 'kabupaten_kota', 'kota_kabupaten', 'kab/kota', 'kota/kabupaten', 'kota/kab']
}
# ============================================================
# FUNGSI BANTU ADMINISTRATIF
# ============================================================
def find_admin_column(df, aliases):
    """Map each admin level (desa/kec/kab) to the first matching column of df.

    Column names are normalized (lowercase, '_' for spaces and '/') before
    substring-matching against the level's aliases. Levels with no matching
    column are omitted from the result.
    """
    def _norm(col):
        return col.strip().lower().replace(' ', '_').replace('/', '_')

    resolved = {}
    for level, candidates in aliases.items():
        hit = next(
            (col for col in df.columns
             if any(alias in _norm(col) for alias in candidates)),
            None,
        )
        if hit is not None:
            resolved[level] = hit
    return resolved
def detect_smallest_admin_level(df):
    """Return the finest admin level present ('desa' < 'kecamatan' < 'kabupaten'), else None."""
    lowered = [c.lower() for c in df.columns]

    def present(*needles):
        return any(n in col for col in lowered for n in needles)

    if present('desa', 'kelurahan'):
        return 'desa'
    if present('kecamatan'):
        return 'kecamatan'
    if present('kab', 'kota'):
        return 'kabupaten'
    return None
def fuzzy_merge(df, master, left_key, right_key, threshold=85):
    """Fuzzy-match region names in df[left_key] against master[right_key].

    Uses rapidfuzz's default scorer with `threshold` as score cutoff, then
    left-joins `master` on the best match.

    NOTE(review): mutates the caller's ``df`` by adding a 'match' column.
    Rows scoring below the cutoff get no match and end up with NaNs after
    the left join.
    """
    matches = df[left_key].apply(
        lambda x: process.extractOne(str(x), master[right_key], score_cutoff=threshold)
    )
    # extractOne returns (choice, score, index) or None; keep the choice only.
    df['match'] = matches.apply(lambda m: m[0] if m else None)
    merged = df.merge(master, left_on='match', right_on=right_key, how='left')
    return merged
def normalize_name(name: str, level: str = None):
    """Normalize an Indonesian administrative-area name for matching.

    Strips parenthesised notes and admin prefixes (Desa/Kecamatan/
    Kabupaten/...), keeps letters only, merges split-word artefacts via
    fuzzy token similarity, and title-cases the result. For the
    kabupaten/kota level, the "Kota " prefix is re-attached only when the
    original name contained "kota".

    Returns None for non-string or blank input.
    """
    if not isinstance(name, str):
        return None
    name = name.strip()
    if not name:
        return None
    # Drop parenthesised annotations, e.g. "Surabaya (Kota)".
    name = re.sub(r'\s*\([^)]*\)\s*', '', name)
    raw = name.lower()
    # Strip administrative prefixes for each level.
    raw = re.sub(r'^(desa|kelurahan|kel|dusun|kampung)\s+', '', raw)
    raw = re.sub(r'^(kecamatan|kec)\s+', '', raw)
    raw = re.sub(r'^(kabupaten|kab\.?|kab)\s+', '', raw)
    if level in ["kabupaten", "kota"]:
        raw = re.sub(r'^(kota\s+)', '', raw)
    # Keep letters and spaces only, then collapse whitespace.
    raw = re.sub(r'[^a-z\s]', '', raw)
    raw = re.sub(r'\s+', ' ', raw).strip()
    tokens = raw.split()
    merged_tokens = []
    i = 0
    # Merge adjacent near-identical tokens (split-word artefacts such as
    # "sura baya" -> "surabaya"); threshold 75 is a heuristic.
    while i < len(tokens):
        if i < len(tokens) - 1:
            sim = fuzz.ratio(tokens[i], tokens[i + 1])
            if sim > 75:
                merged_tokens.append(tokens[i] + tokens[i + 1])
                i += 2
                continue
        merged_tokens.append(tokens[i])
        i += 1
    # Drop consecutive duplicate tokens (similarity > 95).
    cleaned_tokens = []
    prev = None
    for tok in merged_tokens:
        if prev and fuzz.ratio(prev, tok) > 95:
            continue
        cleaned_tokens.append(tok)
        prev = tok
    raw = " ".join(cleaned_tokens)
    formatted = raw.title()
    if level in ["kabupaten", "kota"]:
        # Re-attach or remove the "Kota " marker based on the original name.
        if "kota" in name.lower():
            if not formatted.startswith("Kota "):
                formatted = f"Kota {formatted}"
        else:
            formatted = formatted.replace("Kota ", "")
    return formatted
def is_geom_empty(g):
    """Return True when g is None, NaN, or an empty shapely geometry."""
    if g is None:
        return True
    nan_like = isinstance(g, float) and pd.isna(g)
    if nan_like:
        return True
    return g.is_empty if isinstance(g, BaseGeometry) else False
import math
def normalize_lon(val, is_lat=False):
    """Coerce a raw coordinate to decimal degrees, or None if impossible.

    Handles values stored with a wrong decimal scale (e.g. 1123456789 for
    112.3456789) by dividing by increasing powers of ten until the value
    falls in the valid range (±180 for longitude, ±90 for latitude).

    Args:
        val: Raw coordinate (number or numeric string).
        is_lat: Validate against the latitude range instead of longitude.

    Returns:
        A float within range, 0 unchanged, or None when the value is
        missing, non-numeric, or cannot be scaled into range.
    """
    if pd.isna(val):
        return None
    try:
        v = float(val)
    # Fix: was a bare `except:` — don't swallow SystemExit/KeyboardInterrupt.
    except (TypeError, ValueError):
        return None
    av = abs(v)
    if av == 0:
        return v
    if (-180 <= v <= 180 and not is_lat) or (-90 <= v <= 90 and is_lat):
        return v
    for factor in (1, 10, 100, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9):
        nv = v / factor
        if (not is_lat and -180 <= nv <= 180) or (is_lat and -90 <= nv <= 90):
            return nv
    return None
def normalize_lat(val):
    """Coerce a raw latitude to decimal degrees.

    Some sources store latitudes without a decimal point (e.g. -8167413802
    for -8.167413802); detect the magnitude and rescale.

    Fix: non-numeric input previously raised ValueError from ``float(val)``,
    unlike normalize_lon which returns None — now consistent.
    """
    if pd.isna(val):
        return None
    try:
        v = float(val)
    except (TypeError, ValueError):
        return None
    av = abs(v)
    if av > 1e9:    # e.g. -8167413802 (10 un-pointed digits)
        return v / 1e9
    elif av > 1e8:  # fallback for shorter un-pointed variants
        return v / 1e8
    else:
        return v
# ============================================================
# FUNGSI UTAMA GEOMETRY DETECTION (LAT/LON / PATH)
# ============================================================
def detect_and_build_geometry(df: pd.DataFrame, master_polygons: gpd.GeoDataFrame = None):
    """Detect and build a geometry column for a DataFrame.

    Tried in order: an existing geometry column on a GeoDataFrame, lat/lon
    columns, a coordinate/WKT column, and finally (when `master_polygons`
    is given) a join on administrative names. Returns a GeoDataFrame when
    a geometry source is found, otherwise the original DataFrame.

    Fix: coordinate-list cells were parsed with ``eval`` — arbitrary code
    execution on user-uploaded content; replaced with ``ast.literal_eval``.
    """
    import ast  # local: only needed for the coordinate-list path

    if isinstance(df, gpd.GeoDataFrame):
        geom_cols = [
            c for c in df.columns
            if re.match(r'^(geometry|geom|the_geom|wkb_geometry)$', c, re.IGNORECASE)
            or c.lower().startswith("geom")
            or c.lower().endswith("geometry")
        ]
        # if "geometry" in df.columns and df.geometry.notna().any():
        if geom_cols:
            geom_count = df.geometry.notna().sum()
            geom_type = list(df.geom_type.unique())
            print(f"[INFO] Detected existing geometry in GeoDataFrame ({geom_count} features, {geom_type}).")
            return df
    # Heuristic lat/lon column detection (y/x coordinate variants included).
    lat_col = next((c for c in df.columns if re.search(r'\b(lat|latitude|y[_\s]*coord|y$)\b', c.lower())), None)
    lon_col = next((c for c in df.columns if re.search(r'\b(lon|long|longitude|x[_\s]*coord|x$)\b', c.lower())), None)
    if lat_col and lon_col:
        df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce')
        df[lon_col] = pd.to_numeric(df[lon_col], errors='coerce')
        # Rescale values stored without a decimal point into valid ranges.
        df[lon_col] = df[lon_col].apply(lambda x: normalize_lon(x, is_lat=False))
        df[lat_col] = df[lat_col].apply(normalize_lat)
        gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df[lon_col], df[lat_col]), crs="EPSG:4326")
        print("[INFO] Geometry dibangun dari kolom lat/lon.")
        return gdf
    # A single column holding coordinates: list-of-points or WKT text.
    coord_col = next(
        (c for c in df.columns if re.search(r'(geom|geometry|wkt|shp|shape|path|coord)', c.lower())), None
    )
    if coord_col and df[coord_col].notnull().any():
        sample_val = str(df[coord_col].dropna().iloc[0]).strip()
        if sample_val.startswith('['):
            def parse_geom(val):
                try:
                    # Safe literal parsing of "[(x, y), ...]" style cells.
                    pts = ast.literal_eval(val)
                    return LineString(pts)
                except Exception:
                    return None
            df['geometry'] = df[coord_col].apply(parse_geom)
            gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")
            print("[INFO] Geometry dibangun dari kolom koordinat/path (list of points).")
            return gdf
        elif any(x in sample_val.upper() for x in ["POINT", "LINESTRING", "POLYGON"]):
            try:
                df['geometry'] = df[coord_col].apply(
                    lambda g: wkt.loads(g) if isinstance(g, str) and any(
                        x in g.upper() for x in ["POINT", "LINESTRING", "POLYGON"]
                    ) else None
                )
                gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")
                print("[INFO] Geometry dibangun dari kolom WKT (Point/Line/Polygon/MultiPolygon).")
                return gdf
            except Exception as e:
                print(f"[WARN] Gagal parsing kolom geometry sebagai WKT: {e}")
    # Last resort: join against a provided master polygon layer by admin name.
    if master_polygons is not None:
        df.columns = df.columns.str.lower().str.strip().str.replace(' ', '_').str.replace('/', '_')
        matches = find_admin_column(df, COLUMN_ALIASES)
        if 'desa' in matches:
            admin_col = matches['desa']
            merged = df.merge(master_polygons, left_on=admin_col, right_on='nama_desa', how='left')
            # Fall back to fuzzy matching when exact names leave gaps.
            if merged['geometry'].isna().sum() > 0:
                merged = fuzzy_merge(df, master_polygons, admin_col, 'nama_desa')
            gdf = gpd.GeoDataFrame(merged, geometry='geometry', crs=master_polygons.crs)
            return gdf
        elif 'kecamatan' in matches:
            admin_col = matches['kecamatan']
            merged = df.merge(master_polygons, left_on=admin_col, right_on='nama_kecamatan', how='left')
            gdf = gpd.GeoDataFrame(merged, geometry='geometry', crs=master_polygons.crs)
            return gdf
        elif 'kabupaten' in matches:
            admin_col = matches['kabupaten']
            merged = df.merge(master_polygons, left_on=admin_col, right_on='nama_kabupaten', how='left')
            gdf = gpd.GeoDataFrame(merged, geometry='geometry', crs=master_polygons.crs)
            return gdf
    print("[WARN] Tidak ditemukan geometry (lat/lon, path, atau master).")
    return df
# def get_reference_polygons(level):
# """Mengambil data batas wilayah (MultiPolygon) dari DB referensi"""
# table_map = {
# 'desa': f"{REFERENCE_SCHEMA}.administrasi_ar_keldesa_jatim",
# 'kecamatan': f"{REFERENCE_SCHEMA}.administrasi_ar_kec_jatim",
# 'kabupaten': f"{REFERENCE_SCHEMA}.administrasi_ar_kabkot_jatim"
# }
# table_name = table_map.get(level)
# if not table_name:
# raise ValueError(f"Tidak ada tabel referensi untuk level '{level}'.")
# engine = create_engine(REFERENCE_DB_URL)
# query = f"SELECT *, ST_Multi(geom) AS geometry FROM {table_name}"
# gdf = gpd.read_postgis(query, engine, geom_col='geometry')
# print(f"[INFO] {len(gdf)} data referensi '{level}' berhasil dimuat dari {table_name}.")
# return gdf
from functools import lru_cache
@lru_cache(maxsize=3)
def get_reference_polygons(level):
    """Load reference boundary polygons for an admin level.

    Three cache tiers: lru_cache (process), a local parquet file
    (``cache/<level>_ref.parquet``), and finally the reference database.

    Args:
        level: 'desa', 'kecamatan' or 'kabupaten'.

    Raises:
        ValueError: for an unknown level. Fix: previously an unknown level
            produced SQL reading "FROM None".
    """
    local_path = f"cache/{level}_ref.parquet"
    if os.path.exists(local_path):
        print(f"[CACHE] Memuat referensi '{level}' dari file lokal.")
        return gpd.read_parquet(local_path)
    print(f"[DB] Mengambil data referensi '{level}' dari database...")
    table_map = {
        "desa": f"{REFERENCE_SCHEMA}.administrasi_ar_keldesa_jatim",
        "kecamatan": f"{REFERENCE_SCHEMA}.administrasi_ar_kec_jatim",
        "kabupaten": f"{REFERENCE_SCHEMA}.administrasi_ar_kabkot_jatim"
    }
    table_name = table_map.get(level)
    if not table_name:
        raise ValueError(f"Tidak ada tabel referensi untuk level '{level}'.")
    engine = create_engine(REFERENCE_DB_URL)
    # ST_Multi normalizes mixed Polygon/MultiPolygon rows to MultiPolygon.
    query = f"SELECT *, ST_Multi(geom) AS geometry FROM {table_name}"
    gdf = gpd.read_postgis(query, engine, geom_col="geometry")
    # Fix: to_parquet fails when the cache directory does not exist yet.
    os.makedirs("cache", exist_ok=True)
    gdf.to_parquet(local_path)
    print(f"[CACHE] Disimpan ke {local_path}")
    return gdf
# ============================================================
# Optimize Join
# ============================================================
def build_join_key(df, cols):
    """Build a row-wise '|'-joined string key from the given columns.

    NaN cells (stringified as "nan") become empty strings so missing parts
    don't pollute the key.

    Fix: the previous implementation called ``np.char.add.reduce``, but
    ``np.char.add`` is a plain function (not a ufunc) and has no ``reduce``
    attribute, so the call raised AttributeError.

    Returns:
        numpy array of key strings, aligned with df's rows.
    """
    parts = df[cols].astype(str).replace("nan", "", regex=False)
    return parts.agg("|".join, axis=1).to_numpy()
# ============================================================
# FUNGSI: AUTO ATTACH POLYGON KE DATAFRAME NON-SPASIAL
# ============================================================
def attach_polygon_geometry_auto(df: pd.DataFrame):
    """Attach a MultiPolygon geometry column to a non-spatial DataFrame.

    Detects the smallest administrative level present (desa/kecamatan/
    kabupaten), loads the matching reference polygons, normalizes the
    names on both sides, joins on a combined key, and fuzzy-matches any
    leftovers. Rows are not duplicated (reference lookup is de-duplicated
    on the join key). Returns the input unchanged when the column
    structure is incomplete.
    """
    level = detect_smallest_admin_level(df)
    if not level:
        print("[WARN] Tidak ditemukan kolom administratif (desa/kecamatan/kabupaten).")
        return df
    print(f"[INFO] Detected smallest admin level: {level}")
    ref_gdf = get_reference_polygons(level)
    # Locate the admin columns in the input frame (first match wins).
    desa_col = next((c for c in df.columns if any(x in c.lower() for x in ['desa', 'kelurahan'])), None)
    kec_col = next((c for c in df.columns if 'kec' in c.lower()), None)
    kab_col = next((c for c in df.columns if any(x in c.lower() for x in ['kab', 'kota'])), None)
    # A finer level requires all coarser levels for an unambiguous key.
    if desa_col and (not kec_col or not kab_col):
        print("[ERROR] Kolom 'Desa' ditemukan tetapi kolom 'Kecamatan' dan/atau 'Kabupaten' tidak lengkap.")
        print(f"[DEBUG] Ditemukan: Desa={desa_col}, Kec={kec_col}, Kab={kab_col}")
        return df
    elif not desa_col and kec_col and not kab_col:
        print("[ERROR] Kolom 'Kecamatan' ditemukan tetapi kolom 'Kabupaten/Kota' tidak ditemukan.")
        print(f"[DEBUG] Ditemukan: Desa={desa_col}, Kec={kec_col}, Kab={kab_col}")
        return df
    elif kab_col and not desa_col and not kec_col :
        print("[INFO] Struktur kolom administratif valid (minimal Kabupaten/Kota ditemukan).")
        print(f"[DEBUG] Ditemukan: Desa={desa_col}, Kec={kec_col}, Kab={kab_col}")
    elif not desa_col and not kec_col and not kab_col:
        print("[WARN] Tidak ditemukan kolom administratif apapun (Desa/Kecamatan/Kabupaten).")
        print(f"[DEBUG] Kolom CSV: {list(df.columns)}")
        return df
    # Corresponding column names in the reference layer (from app config).
    desa_ref = DESA_REF
    kec_ref = KEC_REF
    kab_ref = KAB_REF
    # Normalize names on both sides so the join key is comparable.
    if desa_col is not None:
        df[desa_col] = df[desa_col].astype(str).apply(lambda x: normalize_name(x, "desa"))
    if kec_col is not None:
        df[kec_col] = df[kec_col].astype(str).apply(lambda x: normalize_name(x, "kecamatan"))
    if kab_col is not None:
        df[kab_col] = df[kab_col].astype(str).apply(lambda x: normalize_name(x, "kabupaten"))
    if desa_ref is not None:
        ref_gdf[desa_ref] = ref_gdf[desa_ref].astype(str).apply(lambda x: normalize_name(x, "desa"))
    if kec_ref is not None:
        ref_gdf[kec_ref] = ref_gdf[kec_ref].astype(str).apply(lambda x: normalize_name(x, "kecamatan"))
    if kab_ref is not None:
        ref_gdf[kab_ref] = ref_gdf[kab_ref].astype(str).apply(lambda x: normalize_name(x, "kabupaten"))
    join_cols = [col for col in [desa_col, kec_col, kab_col] if col]
    if not join_cols:
        print("[ERROR] Tidak ada kolom administratif yang bisa digunakan untuk join key.")
    else:
        join_cols_df = [col for col in [desa_col, kec_col, kab_col] if col]
        join_cols_ref = [col for col in [desa_ref, kec_ref, kab_ref] if col]
        # Trim both key lists to the same depth, keeping the coarser levels.
        common_depth = min(len(join_cols_df), len(join_cols_ref))
        join_cols_df = join_cols_df[-common_depth:]
        join_cols_ref = join_cols_ref[-common_depth:]
        # print(f"[DEBUG] Join kolom DF : {join_cols_df}")
        # print(f"[DEBUG] Join kolom REF : {join_cols_ref}")
        # df["_join_key"] = df[join_cols_df].astype(str).agg("|".join, axis=1)
        # ref_gdf["_join_key"] = ref_gdf[join_cols_ref].astype(str).agg("|".join, axis=1)
        df["_join_key"] = build_join_key(df, join_cols_df)
        ref_gdf["_join_key"] = build_join_key(ref_gdf, join_cols_ref)
        # print(f"[INFO] Join key berhasil dibuat dari kolom: {join_cols_df}")
        # De-duplicate the reference so the left join cannot multiply rows.
        ref_lookup = ref_gdf[["_join_key", "geometry"]].drop_duplicates(subset=["_join_key"])
        df = df.merge(ref_lookup, how="left", on="_join_key")
        matched = df["geometry"].notna().sum()
        # print(f"[INFO] {matched} dari {len(df)} baris cocok langsung berdasarkan (desa + kec + kab/kota).")
        if matched < len(df):
            unmatched = df[df["geometry"].isna()]
            # print(f"[INFO] Melakukan fuzzy match untuk {len(unmatched)} baris yang belum cocok...")
            ref_dict = dict(zip(ref_lookup["_join_key"], ref_lookup["geometry"]))
            def find_fuzzy_geom(row):
                # Best-effort fuzzy lookup of the combined key against the reference.
                key = row["_join_key"]
                if not isinstance(key, str):
                    return None
                # fuzzy old
                # match = process.extractOne(key, list(ref_dict.keys()), scorer=fuzz.token_sort_ratio)
                # fuzzy new
                match = process.extractOne(
                    key, list(ref_dict.keys()), scorer=fuzz.token_set_ratio, score_cutoff=80
                )
                if match and match[1] >= 85:
                    return ref_dict[match[0]]
                return None
            df.loc[df["geometry"].isna(), "geometry"] = df[df["geometry"].isna()].apply(find_fuzzy_geom, axis=1)
        df = df.drop(columns=["_join_key"], errors="ignore")
    # admin_cols = [col for col in [desa_col, kec_col, kab_col] if col and col in df.columns]
    # if matched < len(df):
    #     diff = df[df['geometry'].isna()][admin_cols]
    #     print("[DEBUG] Baris yang tidak match:")
    #     if diff.empty:
    #         print("(semua baris berhasil match)")
    #     else:
    #         print(diff.to_string(index=False))
    # print(f"[REPORT] Total match: {df['geometry'].notna().sum()} / {len(df)} ({df['geometry'].notna().mean()*100:.2f}%)")
    return gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")

View File

@@ -0,0 +1,693 @@
from fastapi import HTTPException
import requests
from sqlalchemy import text
from app.core.config import GEONETWORK_PASS, GEONETWORK_URL, GEONETWORK_USER
from database.connection import engine
from datetime import datetime
from uuid import uuid4
import re
def create_gn_session():
    """Open an authenticated GeoNetwork session.

    Returns a (session, xsrf_token) tuple; raises when GeoNetwork does not
    set the XSRF cookie.
    """
    session = requests.Session()
    session.auth = (GEONETWORK_USER, GEONETWORK_PASS)
    # Priming request: GeoNetwork sets the XSRF cookie on this call.
    session.get(f"{GEONETWORK_URL}/srv/eng/info?type=me")
    token = session.cookies.get("XSRF-TOKEN")
    if not token:
        raise Exception("XSRF token missing")
    return session, token
def escape_url_params(url: str) -> str:
    """Escape XML-unsafe characters in a URL.

    Every '&' becomes '&amp;' unless it already starts an '&amp;' entity,
    so the URL can be embedded verbatim in an XML document.
    """
    bare_amp = re.compile(r'&(?!amp;)')
    return bare_amp.sub('&amp;', url)
def fix_xml_urls(xml: str) -> str:
    """Escape the URL inside every <gmd:URL>...</gmd:URL> element of `xml`.

    URLs are stripped of surrounding whitespace and their bare '&'
    characters are converted to '&amp;' (helper logic inlined so this
    function is self-contained).
    """
    def _escape(match):
        url = match.group(1).strip()
        url = re.sub(r'&(?!amp;)', '&amp;', url)
        return f"<gmd:URL>{url}</gmd:URL>"

    return re.sub(r"<gmd:URL>(.*?)</gmd:URL>", _escape, xml, flags=re.DOTALL)
async def get_extent(table_name: str):
    """Return the bounding box of `table_name`'s geometries, or None if empty.

    SECURITY(review): table_name is interpolated into the SQL; callers must
    guarantee it is a trusted/validated identifier.
    """
    # Fix: SQLAlchemy 2.x connections reject a plain str — wrap in text().
    sql = text(f"""
    SELECT
        ST_XMin(extent), ST_YMin(extent),
        ST_XMax(extent), ST_YMax(extent)
    FROM (
        SELECT ST_Extent(geom) AS extent
        FROM public."{table_name}"
    ) AS box;
    """)
    async with engine.connect() as conn:
        result = await conn.execute(sql)
        row = result.fetchone()
    if not row or row[0] is None:
        return None
    # NOTE(review): the computed extent is currently overridden by a
    # hard-coded East-Java bounding box (temporary stub); restore the
    # commented return below to use the real values.
    # return {
    #     "xmin": row[0],
    #     "ymin": row[1],
    #     "xmax": row[2],
    #     "ymax": row[3]
    # }
    return {
        "xmin": 110.1372,  # west
        "ymin": -9.3029,   # south
        "xmax": 114.5287,  # east
        "ymax": -5.4819    # north
    }
async def get_author_metadata(table_name: str):
    """Fetch author/organization metadata for a published table.

    Returns the first matching row of backend.author_metadata (joined with
    users and organizations) as a plain dict.

    Raises:
        Exception: when no metadata row exists for `table_name`.
    """
    # Fix: SQLAlchemy 2.x requires an executable — a plain str passed to
    # conn.execute raises ObjectNotExecutableError, and the :table bind
    # parameter needs text() as well (`text` is imported but was unused).
    sql = text("""
    SELECT am.table_title, am.dataset_title, am.dataset_abstract, am.keywords, am.date_created,
        am.organization_name, am.contact_person_name, am.created_at,
        am.contact_email, am.contact_phone, am.geom_type,
        u.organization_id,
        o.address AS organization_address,
        o.email AS organization_email,
        o.phone_number AS organization_phone
    FROM backend.author_metadata AS am
    LEFT JOIN backend.users u ON am.user_id = u.id
    LEFT JOIN backend.organizations o ON u.organization_id = o.id
    WHERE am.table_title = :table
    LIMIT 1
    """)
    async with engine.connect() as conn:
        result = await conn.execute(sql, {"table": table_name})
        row = result.fetchone()
    if not row:
        raise Exception(f"Tidak ada metadata untuk tabel: {table_name}")
    # Row._mapping converts the SQLAlchemy row to a plain dict.
    return dict(row._mapping)
def map_geom_type(gtype):
    """Map a geometry type name (str or list of str) to an ISO 19139 code.

    Polygon/Multi* -> 'surface', Line* -> 'curve', Point -> 'point';
    anything else (including None or an empty list) -> 'surface'.
    """
    # A list means "all geometry types in the layer": take the first.
    if isinstance(gtype, list):
        gtype = gtype[0] if gtype else None
    if gtype is None:
        return "surface"
    lowered = str(gtype).lower()
    if "polygon" in lowered or "multi" in lowered:
        return "surface"
    if "line" in lowered:
        return "curve"
    if "point" in lowered:
        return "point"
    return "surface"
def generate_metadata_xml(table_name, meta, extent, geoserver_links):
    """Build an ISO 19115:2003/19139 metadata XML document for one dataset.

    Parameters:
        table_name: dataset identifier.  NOTE(review): the caller
            (``publish_metadata``) currently passes ``meta["dataset_title"]``
            here, not the physical table name — confirm intent.
        meta: metadata row from ``get_author_metadata`` (titles, abstract,
            keywords, contact/organization fields, dates, geom_type).
        extent: bounding box dict with ``xmin``/``ymin``/``xmax``/``ymax``.
        geoserver_links: dict providing ``wms_url`` and ``wfs_url``.

    Returns:
        The metadata document as an XML string, with a freshly generated
        UUID in ``gmd:fileIdentifier``.

    NOTE(review): values are interpolated without XML escaping; free-text
    fields containing ``&`` or ``<`` would produce invalid XML (the caller
    only repairs ``<gmd:URL>`` elements afterwards via ``fix_xml_urls``).
    NOTE(review): ``datetime.utcnow()`` is naive UTC but the stamp is
    labelled ``+07:00`` — the recorded time appears off by seven hours;
    confirm.  Several values are hard-coded: city "Surabaya", service
    hours, geometricObjectCount=38, scale 1:25000, EPSG:4326.
    """
    # One <gmd:keyword> element per comma-separated keyword.
    keywords_xml = "".join([
        f"""
        <gmd:keyword><gco:CharacterString>{kw.strip()}</gco:CharacterString></gmd:keyword>
        """ for kw in meta["keywords"].split(",")
    ])
    geom_type_code = map_geom_type(meta["geom_type"])
    print('type', geom_type_code)  # debug output
    # A new identifier is minted on every call, so re-generating metadata
    # for the same table creates a new GeoNetwork record.
    uuid = str(uuid4())
    return f"""
<gmd:MD_Metadata
xmlns:gmd="http://www.isotc211.org/2005/gmd"
xmlns:gco="http://www.isotc211.org/2005/gco"
xmlns:srv="http://www.isotc211.org/2005/srv"
xmlns:gmx="http://www.isotc211.org/2005/gmx"
xmlns:gts="http://www.isotc211.org/2005/gts"
xmlns:gsr="http://www.isotc211.org/2005/gsr"
xmlns:gmi="http://www.isotc211.org/2005/gmi"
xmlns:gml="http://www.opengis.net/gml/3.2"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.isotc211.org/2005/gmd http://schemas.opengis.net/csw/2.0.2/profiles/apiso/1.0.0/apiso.xsd">
<gmd:fileIdentifier>
<gco:CharacterString>{uuid}</gco:CharacterString>
</gmd:fileIdentifier>
<gmd:language>
<gmd:LanguageCode codeList="http://www.loc.gov/standards/iso639-2/" codeListValue="eng"/>
</gmd:language>
<gmd:characterSet>
<gmd:MD_CharacterSetCode codeListValue="utf8" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_CharacterSetCode"/>
</gmd:characterSet>
<gmd:hierarchyLevel>
<gmd:MD_ScopeCode codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_ScopeCode" codeListValue="feature"/>
</gmd:hierarchyLevel>
<gmd:contact>
<gmd:CI_ResponsibleParty>
<gmd:individualName>
<gco:CharacterString>{meta['contact_person_name']}</gco:CharacterString>
</gmd:individualName>
<gmd:organisationName>
<gco:CharacterString>{meta['organization_name']}</gco:CharacterString>
</gmd:organisationName>
<gmd:contactInfo>
<gmd:CI_Contact>
<gmd:phone>
<gmd:CI_Telephone>
<gmd:voice>
<gco:CharacterString>{meta['organization_phone']}</gco:CharacterString>
</gmd:voice>
<gmd:facsimile>
<gco:CharacterString>{meta['organization_phone']}</gco:CharacterString>
</gmd:facsimile>
</gmd:CI_Telephone>
</gmd:phone>
<gmd:address>
<gmd:CI_Address>
<gmd:deliveryPoint>
<gco:CharacterString>{meta['organization_address']}</gco:CharacterString>
</gmd:deliveryPoint>
<gmd:city>
<gco:CharacterString>Surabaya</gco:CharacterString>
</gmd:city>
<gmd:administrativeArea>
<gco:CharacterString>Jawa Timur</gco:CharacterString>
</gmd:administrativeArea>
<gmd:country>
<gco:CharacterString>Indonesia</gco:CharacterString>
</gmd:country>
<gmd:electronicMailAddress>
<gco:CharacterString>{meta['organization_email']}</gco:CharacterString>
</gmd:electronicMailAddress>
</gmd:CI_Address>
</gmd:address>
<gmd:hoursOfService>
<gco:CharacterString>08.00-16.00</gco:CharacterString>
</gmd:hoursOfService>
</gmd:CI_Contact>
</gmd:contactInfo>
<gmd:role>
<gmd:CI_RoleCode codeListValue="pointOfContact" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#CI_RoleCode"/>
</gmd:role>
</gmd:CI_ResponsibleParty>
</gmd:contact>
<gmd:dateStamp>
<gco:DateTime>{datetime.utcnow().isoformat()}+07:00</gco:DateTime>
</gmd:dateStamp>
<gmd:metadataStandardName>
<gco:CharacterString>ISO 19115:2003/19139</gco:CharacterString>
</gmd:metadataStandardName>
<gmd:metadataStandardVersion>
<gco:CharacterString>1.0</gco:CharacterString>
</gmd:metadataStandardVersion>
<gmd:spatialRepresentationInfo>
<gmd:MD_VectorSpatialRepresentation>
<gmd:geometricObjects>
<gmd:MD_GeometricObjects>
<gmd:geometricObjectType>
<gmd:MD_GeometricObjectTypeCode codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_GeometricObjectTypeCode" codeListValue="{geom_type_code}"/>
</gmd:geometricObjectType>
<gmd:geometricObjectCount>
<gco:Integer>38</gco:Integer>
</gmd:geometricObjectCount>
</gmd:MD_GeometricObjects>
</gmd:geometricObjects>
</gmd:MD_VectorSpatialRepresentation>
</gmd:spatialRepresentationInfo>
<gmd:referenceSystemInfo>
<gmd:MD_ReferenceSystem>
<gmd:referenceSystemIdentifier>
<gmd:RS_Identifier>
<gmd:code>
<gco:CharacterString>4326</gco:CharacterString>
</gmd:code>
<gmd:codeSpace>
<gco:CharacterString>EPSG</gco:CharacterString>
</gmd:codeSpace>
</gmd:RS_Identifier>
</gmd:referenceSystemIdentifier>
</gmd:MD_ReferenceSystem>
</gmd:referenceSystemInfo>
<gmd:identificationInfo>
<gmd:MD_DataIdentification>
<gmd:citation>
<gmd:CI_Citation>
<gmd:title>
<gco:CharacterString>{meta['dataset_title']}</gco:CharacterString>
</gmd:title>
<gmd:date>
<gmd:CI_Date>
<gmd:date>
<gco:DateTime>{meta['created_at'].isoformat()}+07:00</gco:DateTime>
</gmd:date>
<gmd:dateType>
<gmd:CI_DateTypeCode codeListValue="publication" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#CI_DateTypeCode"/>
</gmd:dateType>
</gmd:CI_Date>
</gmd:date>
<gmd:edition>
<gco:CharacterString>{meta['date_created'].year}</gco:CharacterString>
</gmd:edition>
<gmd:citedResponsibleParty>
<gmd:CI_ResponsibleParty>
<gmd:individualName>
<gco:CharacterString>{meta['contact_person_name']}</gco:CharacterString>
</gmd:individualName>
<gmd:organisationName>
<gco:CharacterString>{meta['organization_name']}</gco:CharacterString>
</gmd:organisationName>
<gmd:contactInfo>
<gmd:CI_Contact>
<gmd:phone>
<gmd:CI_Telephone>
<gmd:voice>
<gco:CharacterString>{meta['organization_phone']}</gco:CharacterString>
</gmd:voice>
<gmd:facsimile>
<gco:CharacterString>{meta['organization_phone']}</gco:CharacterString>
</gmd:facsimile>
</gmd:CI_Telephone>
</gmd:phone>
<gmd:address>
<gmd:CI_Address>
<gmd:deliveryPoint>
<gco:CharacterString>{meta['organization_address']}</gco:CharacterString>
</gmd:deliveryPoint>
<gmd:city>
<gco:CharacterString>Surabaya</gco:CharacterString>
</gmd:city>
<gmd:country>
<gco:CharacterString>Indonesia</gco:CharacterString>
</gmd:country>
<gmd:electronicMailAddress>
<gco:CharacterString>{meta['organization_email']}</gco:CharacterString>
</gmd:electronicMailAddress>
</gmd:CI_Address>
</gmd:address>
<gmd:hoursOfService>
<gco:CharacterString>08.00-16.00</gco:CharacterString>
</gmd:hoursOfService>
</gmd:CI_Contact>
</gmd:contactInfo>
<gmd:role>
<gmd:CI_RoleCode codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#CI_RoleCode" codeListValue="custodian"/>
</gmd:role>
</gmd:CI_ResponsibleParty>
</gmd:citedResponsibleParty>
<gmd:otherCitationDetails>
<gco:CharacterString>Timezone: UTC+7 (Asia/Jakarta)</gco:CharacterString>
</gmd:otherCitationDetails>
</gmd:CI_Citation>
</gmd:citation>
<gmd:abstract>
<gco:CharacterString>{meta['dataset_abstract']}</gco:CharacterString>
</gmd:abstract>
<gmd:purpose>
<gco:CharacterString>{meta['dataset_abstract']}</gco:CharacterString>
</gmd:purpose>
<gmd:status>
<gmd:MD_ProgressCode codeListValue="completed" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_ProgressCode"/>
</gmd:status>
<gmd:pointOfContact>
<gmd:CI_ResponsibleParty>
<gmd:individualName>
<gco:CharacterString>Lab AI Polinema</gco:CharacterString>
</gmd:individualName>
<gmd:organisationName>
<gco:CharacterString>Lab AI Polinema</gco:CharacterString>
</gmd:organisationName>
<gmd:positionName gco:nilReason="missing"/>
<gmd:contactInfo>
<gmd:CI_Contact>
<gmd:phone>
<gmd:CI_Telephone>
<gmd:voice>
<gco:CharacterString>{meta['organization_phone']}</gco:CharacterString>
</gmd:voice>
<gmd:facsimile>
<gco:CharacterString>{meta['organization_phone']}</gco:CharacterString>
</gmd:facsimile>
</gmd:CI_Telephone>
</gmd:phone>
<gmd:address>
<gmd:CI_Address>
<gmd:deliveryPoint>
<gco:CharacterString>{meta['organization_address']}</gco:CharacterString>
</gmd:deliveryPoint>
<gmd:city>
<gco:CharacterString>Surabaya</gco:CharacterString>
</gmd:city>
<gmd:administrativeArea>
<gco:CharacterString>Jawa Timur</gco:CharacterString>
</gmd:administrativeArea>
<gmd:country>
<gco:CharacterString>Indonesia</gco:CharacterString>
</gmd:country>
<gmd:electronicMailAddress>
<gco:CharacterString>{meta['organization_email']}</gco:CharacterString>
</gmd:electronicMailAddress>
</gmd:CI_Address>
</gmd:address>
</gmd:CI_Contact>
</gmd:contactInfo>
<gmd:role>
<gmd:CI_RoleCode codeListValue="owner" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#CI_RoleCode"/>
</gmd:role>
</gmd:CI_ResponsibleParty>
</gmd:pointOfContact>
<gmd:resourceMaintenance>
<gmd:MD_MaintenanceInformation>
<gmd:maintenanceAndUpdateFrequency>
<gmd:MD_MaintenanceFrequencyCode codeListValue="annually" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_MaintenanceFrequencyCode"/>
</gmd:maintenanceAndUpdateFrequency>
</gmd:MD_MaintenanceInformation>
</gmd:resourceMaintenance>
<gmd:descriptiveKeywords>
<gmd:MD_Keywords>
{keywords_xml}
</gmd:MD_Keywords>
</gmd:descriptiveKeywords>
<gmd:resourceConstraints>
<gmd:MD_LegalConstraints>
<gmd:accessConstraints>
<gmd:MD_RestrictionCode codeListValue="copyright" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_RestrictionCode"/>
</gmd:accessConstraints>
<gmd:useConstraints>
<gmd:MD_RestrictionCode codeListValue="otherRestrictions" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_RestrictionCode"/>
</gmd:useConstraints>
<gmd:otherConstraints>
<gco:CharacterString>Penggunaan data harus mencantumkan sumber: {meta['organization_name']}.</gco:CharacterString>
</gmd:otherConstraints>
</gmd:MD_LegalConstraints>
</gmd:resourceConstraints>
<gmd:spatialRepresentationType>
<gmd:MD_SpatialRepresentationTypeCode codeListValue="vector" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_SpatialRepresentationTypeCode"/>
</gmd:spatialRepresentationType>
<gmd:spatialResolution>
<gmd:MD_Resolution>
<gmd:equivalentScale>
<gmd:MD_RepresentativeFraction>
<gmd:denominator>
<gco:Integer>25000</gco:Integer>
</gmd:denominator>
</gmd:MD_RepresentativeFraction>
</gmd:equivalentScale>
</gmd:MD_Resolution>
</gmd:spatialResolution>
<gmd:language>
<gmd:LanguageCode codeList="http://www.loc.gov/standards/iso639-2/" codeListValue="eng"/>
</gmd:language>
<gmd:characterSet>
<gmd:MD_CharacterSetCode codeListValue="utf8" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_CharacterSetCode"/>
</gmd:characterSet>
<gmd:extent>
<gmd:EX_Extent>
<gmd:geographicElement>
<gmd:EX_GeographicBoundingBox>
<gmd:westBoundLongitude><gco:Decimal>{extent['xmin']}</gco:Decimal></gmd:westBoundLongitude>
<gmd:eastBoundLongitude><gco:Decimal>{extent['xmax']}</gco:Decimal></gmd:eastBoundLongitude>
<gmd:southBoundLatitude><gco:Decimal>{extent['ymin']}</gco:Decimal></gmd:southBoundLatitude>
<gmd:northBoundLatitude><gco:Decimal>{extent['ymax']}</gco:Decimal></gmd:northBoundLatitude>
</gmd:EX_GeographicBoundingBox>
</gmd:geographicElement>
</gmd:EX_Extent>
</gmd:extent>
</gmd:MD_DataIdentification>
</gmd:identificationInfo>
<gmd:contentInfo>
<gmd:MD_FeatureCatalogueDescription>
<gmd:complianceCode>
<gco:Boolean>true</gco:Boolean>
</gmd:complianceCode>
<gmd:includedWithDataset gco:nilReason="unknown"/>
<gmd:featureCatalogueCitation>
<gmd:CI_Citation>
<gmd:title>
<gco:CharacterString>{meta['dataset_title']}</gco:CharacterString>
</gmd:title>
<gmd:date>
<gmd:CI_Date>
<gmd:date>
<gco:DateTime>{meta['created_at'].isoformat()}+07:00</gco:DateTime>
</gmd:date>
<gmd:dateType>
<gmd:CI_DateTypeCode codeListValue="publication" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#CI_DateTypeCode"/>
</gmd:dateType>
</gmd:CI_Date>
</gmd:date>
<gmd:edition>
<gco:CharacterString>{meta['date_created'].year}</gco:CharacterString>
</gmd:edition>
</gmd:CI_Citation>
</gmd:featureCatalogueCitation>
</gmd:MD_FeatureCatalogueDescription>
</gmd:contentInfo>
<gmd:distributionInfo>
<gmd:MD_Distribution>
<gmd:transferOptions>
<gmd:MD_DigitalTransferOptions>
<gmd:onLine>
<gmd:CI_OnlineResource>
<gmd:linkage>
<gmd:URL>{geoserver_links["wms_url"]}</gmd:URL>
</gmd:linkage>
<gmd:protocol>
<gco:CharacterString>DB:POSTGIS</gco:CharacterString>
</gmd:protocol>
<gmd:name>
<gco:CharacterString>{meta["dataset_title"]}</gco:CharacterString>
</gmd:name>
<gmd:description>
<gco:CharacterString>{meta["dataset_title"]}</gco:CharacterString>
</gmd:description>
</gmd:CI_OnlineResource>
</gmd:onLine>
<gmd:onLine>
<gmd:CI_OnlineResource>
<gmd:linkage>
<gmd:URL>{geoserver_links["wms_url"]}</gmd:URL>
</gmd:linkage>
<gmd:protocol>
<gco:CharacterString>WWW:LINK-1.0-http--link</gco:CharacterString>
</gmd:protocol>
<gmd:name>
<gco:CharacterString>{meta["dataset_title"]}</gco:CharacterString>
</gmd:name>
<gmd:description>
<gco:CharacterString>{meta["dataset_title"]}</gco:CharacterString>
</gmd:description>
</gmd:CI_OnlineResource>
</gmd:onLine>
<gmd:onLine>
<gmd:CI_OnlineResource>
<gmd:linkage>
<gmd:URL>{geoserver_links["wms_url"]}</gmd:URL>
</gmd:linkage>
<gmd:protocol>
<gco:CharacterString>OGC:WMS</gco:CharacterString>
</gmd:protocol>
<gmd:name>
<gco:CharacterString>{meta["dataset_title"]}</gco:CharacterString>
</gmd:name>
</gmd:CI_OnlineResource>
</gmd:onLine>
<gmd:onLine>
<gmd:CI_OnlineResource>
<gmd:linkage>
<gmd:URL>{geoserver_links["wfs_url"]}</gmd:URL>
</gmd:linkage>
<gmd:protocol>
<gco:CharacterString>OGC:WFS</gco:CharacterString>
</gmd:protocol>
<gmd:name>
<gco:CharacterString>{meta["dataset_title"]}</gco:CharacterString>
</gmd:name>
</gmd:CI_OnlineResource>
</gmd:onLine>
</gmd:MD_DigitalTransferOptions>
</gmd:transferOptions>
</gmd:MD_Distribution>
</gmd:distributionInfo>
<gmd:dataQualityInfo>
<gmd:DQ_DataQuality>
<gmd:scope>
<gmd:DQ_Scope>
<gmd:level>
<gmd:MD_ScopeCode codeListValue="dataset" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_ScopeCode"/>
</gmd:level>
</gmd:DQ_Scope>
</gmd:scope>
<gmd:lineage>
<gmd:LI_Lineage>
<gmd:statement>
<gco:CharacterString>Data dihasilkan dari digitasi peta dasar skala 1:25000 menggunakan QGIS.</gco:CharacterString>
</gmd:statement>
</gmd:LI_Lineage>
</gmd:lineage>
</gmd:DQ_DataQuality>
</gmd:dataQualityInfo>
</gmd:MD_Metadata>
"""
# Geonetwork version 4.4.9.0
def upload_metadata_to_geonetwork(xml_metadata: str):
    """Upload an ISO metadata document to GeoNetwork and publish it.

    Creates an authenticated session (with XSRF token), POSTs the XML as a
    multipart/form-data record, publishes the new record via
    ``publish_record``, and returns its UUID.

    Raises:
        requests.HTTPError: if GeoNetwork rejects the upload (the previous
            version parsed the body as JSON without checking the status,
            so HTTP failures surfaced as confusing decode/UUID errors).
        ValueError: if the response contains no record UUID.
    """
    session, xsrf_token = create_gn_session()
    headers = {
        'X-XSRF-TOKEN': xsrf_token,
        'Accept': 'application/json'
    }
    records_url = f"{GEONETWORK_URL}/srv/api/records"
    # GeoNetwork requires a multipart/form-data upload.
    files = {
        'file': ('metadata.xml', xml_metadata, 'application/xml')
    }
    params = {
        "ownerGroup": 1,  # group "all"
        "ownerUser": 1    # admin
    }
    response = session.post(
        records_url,
        params=params,
        files=files,
        headers=headers,
        cookies=session.cookies.get_dict()
    )
    # Fail fast on HTTP errors before attempting to parse the body.
    response.raise_for_status()
    metadata_infos = response.json().get("metadataInfos", {})
    uuid = None
    # The response maps internal ids to lists of record infos; take the
    # first record's UUID.
    for records in metadata_infos.values():
        if records and isinstance(records, list):
            uuid = records[0].get("uuid")
            break
    if not uuid:
        raise ValueError("UUID not found in GeoNetwork response")
    publish_record(session, uuid)
    return uuid
async def publish_metadata(table_name: str, geoserver_links: dict):
    """Generate, clean and upload ISO metadata for *table_name*; return the UUID.

    NOTE(review): the XML generator receives ``meta["dataset_title"]`` as
    its ``table_name`` argument rather than the physical table name —
    confirm this is intentional.
    """
    bbox = await get_extent(table_name)
    author_meta = await get_author_metadata(table_name)
    raw_xml = generate_metadata_xml(
        table_name=author_meta["dataset_title"],
        meta=author_meta,
        extent=bbox,
        geoserver_links=geoserver_links,
    )
    # Repair URL query strings before upload (GeoNetwork expects escaped &).
    uuid = upload_metadata_to_geonetwork(fix_xml_urls(raw_xml))
    print(f"[GeoNetwork] Metadata uploaded. UUID = {uuid}")
    return uuid
def publish_record(session, uuid):
    """Grant group 1 the "view" privilege on record *uuid* (sharing API).

    Uses the XSRF token already present in the session's cookie jar and
    raises ``requests.HTTPError`` on failure.
    """
    print('[uuid]', uuid)
    token = session.cookies.get('XSRF-TOKEN')
    privileges = [{"group": 1, "operations": {"view": True}}]
    response = session.put(
        f"{GEONETWORK_URL}/srv/api/records/{uuid}/sharing",
        json={"clear": True, "privileges": privileges},
        headers={
            "X-XSRF-TOKEN": token,
            "Accept": "application/json",
            "Content-Type": "application/json",
        },
    )
    response.raise_for_status()
# single stand func
# def publish_record(uuid):
# session, xsrf_token = create_gn_session()
# headers = {
# "X-XSRF-TOKEN": xsrf_token,
# "Content-Type": "application/json"
# }
# url = f"{GEONETWORK_URL}/srv/api/records/{uuid}/sharing"
# payload = {
# "clear": True,
# "privileges": [
# {"group": 1, "operations": {"view": True}}
# ]
# }
# resp = session.put(url, json=payload, headers=headers)
# resp.raise_for_status()

View File

@ -0,0 +1,300 @@
import requests
import json
import os
from app.core.config import GEOSERVER_URL, GEOSERVER_USER, GEOSERVER_PASS, GEOSERVER_WORKSPACE
# DATASTORE = "postgis"  # previous setup: one datastore per OPD (agency)
DATASTORE = "server_lokal"
# SLD_DIR = "./styles"
# BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# SLD_DIR = os.path.join(BASE_DIR, "styles")
# Staging folder for generated SLD files: <project root>/style_temp,
# resolved two directory levels above this module.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MAIN_DIR = os.path.abspath(os.path.join(BASE_DIR, "..", ".."))
SLD_DIR = os.path.join(MAIN_DIR, "style_temp")
def publish_layer_to_geoserver(table: str, job_id: str):
    """Publish a PostGIS table as a GeoServer layer and apply its SLD style.

    Steps: (1) create the feature type, (2) upload the job's SLD file if it
    exists, (3) set it as the layer's default style, (4) delete the local
    SLD file, (5) reload GeoServer, then build the WMS/WFS/OpenLayers URLs.

    Bug fix: the previous version applied the style and called
    ``os.remove(sld_file)`` unconditionally, so a missing SLD file (already
    warned about) crashed with ``FileNotFoundError``.  Styling and cleanup
    now only run after a successful upload.

    Returns:
        dict with ``table``, ``style``, ``wms_url``, ``wfs_url`` and
        ``layer_url`` keys.

    Raises:
        Exception: when GeoServer rejects the SLD upload.
    """
    print(f"[GeoServer] Publish layer + upload SLD: {table}")

    # 1. Publish the feature type.  computeDefault=true lets GeoServer
    #    derive bounding box / attribute metadata from the table itself.
    ft_url = (
        f"{GEOSERVER_URL}/rest/workspaces/{GEOSERVER_WORKSPACE}"
        f"/datastores/{DATASTORE}/featuretypes?computeDefault=true"
    )
    ft_payload = {
        "featureType": {
            "name": table,
            "nativeName": table,
            "enabled": True
        }
    }
    requests.post(
        ft_url,
        auth=(GEOSERVER_USER, GEOSERVER_PASS),
        headers={"Content-Type": "application/json"},
        data=json.dumps(ft_payload)
    )
    print(f"[GeoServer] FeatureType published for: {table}")

    # 2. Upload the SLD file (if present) as a workspace style.
    sld_file = f"{SLD_DIR}/{job_id}.sld"
    style_name = table  # style name mirrors the table name
    sld_uploaded = False
    if not os.path.exists(sld_file):
        print(f"[WARNING] SLD file tidak ditemukan: {sld_file}")
    else:
        print(f"[GeoServer] Upload SLD {sld_file}")
        style_url = (
            f"{GEOSERVER_URL}/rest/workspaces/"
            f"{GEOSERVER_WORKSPACE}/styles"
        )
        with open(sld_file, "r", encoding="utf-8") as f:
            sld_content = f.read()
        # Strip any BOM / leading whitespace: GeoServer rejects SLD bodies
        # that do not start at the XML declaration.
        sld_content = sld_content.lstrip("\ufeff \t\r\n")
        resp = requests.post(
            f"{style_url}?name={style_name}",
            auth=(GEOSERVER_USER, GEOSERVER_PASS),
            headers={"Content-Type": "application/vnd.ogc.sld+xml"},
            data=sld_content.encode("utf-8")
        )
        if resp.status_code not in (200, 201):
            raise Exception(
                f"Upload SLD gagal ({resp.status_code}): {resp.text}"
            )
        print(f"[GeoServer] SLD uploaded: {style_name}")
        sld_uploaded = True

    # 3 + 4. Apply the style and delete the local file — only when the
    # upload actually happened (otherwise the layer keeps its default
    # style and there is no file to remove).
    if sld_uploaded:
        layer_url = f"{GEOSERVER_URL}/rest/layers/{GEOSERVER_WORKSPACE}:{table}"
        layer_payload = {
            "layer": {
                "defaultStyle": {
                    "name": style_name,
                    "workspace": GEOSERVER_WORKSPACE
                },
                "enabled": True
            }
        }
        requests.put(
            layer_url,
            auth=(GEOSERVER_USER, GEOSERVER_PASS),
            headers={"Content-Type": "application/json"},
            data=json.dumps(layer_payload)
        )
        print(f"[GeoServer] SLD applied as default style for {table}")
        os.remove(sld_file)
        print(f"[CLEANUP] SLD file removed: {sld_file}")

    # 5. Reload GeoServer so the new layer/style are immediately visible.
    requests.post(
        f"{GEOSERVER_URL}/rest/reload",
        auth=(GEOSERVER_USER, GEOSERVER_PASS)
    )

    # 6. Build the WMS/WFS links consumed by GeoNetwork metadata, plus an
    #    OpenLayers preview URL (hard-coded East-Java bbox).
    wms_link = (
        f"{GEOSERVER_URL}/{GEOSERVER_WORKSPACE}/wms?"
        f"service=WMS&request=GetMap&layers={GEOSERVER_WORKSPACE}:{table}"
    )
    wfs_link = (
        f"{GEOSERVER_URL}/{GEOSERVER_WORKSPACE}/wfs?"
        f"service=WFS&request=GetFeature&typeName={GEOSERVER_WORKSPACE}:{table}"
    )
    openlayer_url = (
        f"{GEOSERVER_URL}/{GEOSERVER_WORKSPACE}/wms?"
        f"service=WMS"
        f"&version=1.1.0"
        f"&request=GetMap"
        f"&layers={GEOSERVER_WORKSPACE}:{table}"
        f"&styles="
        f"&bbox=110.89528623700005%2C-8.780412043999945%2C116.26994997700001%2C-5.042971664999925"
        f"&width=768"
        f"&height=384"
        f"&srs=EPSG:4326"
        f"&format=application/openlayers"
    )
    return {
        "table": table,
        "style": style_name,
        "wms_url": wms_link,
        "wfs_url": wfs_link,
        "layer_url": openlayer_url
    }
# use default style
# def publish_layer_to_geoserver(table: str):
# print(f"[GeoServer] Publish layer: {table}")
# # ========== 1. Publish Feature Type ==========
# ft_url = f"{GEOSERVER_URL}/rest/workspaces/{WORKSPACE}/datastores/{DATASTORE}/featuretypes"
# payload = {
# "featureType": {
# "name": table,
# "nativeName": table,
# "enabled": True
# }
# }
# requests.post(
# ft_url,
# auth=(GEOSERVER_USER, GEOSERVER_PASS),
# headers={"Content-Type": "application/json"},
# data=json.dumps(payload)
# )
# # ===================================================
# # 2. Tentukan SLD file (prioritas table.sld → fallback default)
# # ===================================================
# table_sld = SLD_DIR / f"{table}.sld"
# default_sld = SLD_DIR / "default_style.sld"
# if table_sld.exists():
# chosen_sld = table_sld
# delete_after = True
# style_name = table # pakai nama style sama dengan layer
# print(f"[SLD] Menggunakan SLD khusus: {chosen_sld}")
# else:
# chosen_sld = default_sld
# delete_after = False
# style_name = "default_style"
# print(f"[SLD] Menggunakan default SLD: {chosen_sld}")
# # ==========================================
# # 3. Upload SLD
# # ==========================================
# style_url = f"{GEOSERVER_URL}/rest/styles"
# with open(chosen_sld, "rb") as sld:
# requests.post(
# f"{style_url}?name={style_name}&workspace={WORKSPACE}",
# auth=(GEOSERVER_USER, GEOSERVER_PASS),
# headers={"Content-Type": "application/vnd.ogc.sld+xml"},
# data=sld.read()
# )
# print(f"[GeoServer] SLD uploaded: {style_name}")
# # ==========================================
# # 4. Apply SLD ke layer
# # ==========================================
# layer_url = f"{GEOSERVER_URL}/rest/layers/{WORKSPACE}:{table}"
# payload = {
# "layer": {
# "defaultStyle": {
# "name": style_name,
# "workspace": WORKSPACE
# },
# "enabled": True
# }
# }
# requests.put(
# layer_url,
# auth=(GEOSERVER_USER, GEOSERVER_PASS),
# headers={"Content-Type": "application/json"},
# data=json.dumps(payload)
# )
# print(f"[GeoServer] Style '{style_name}' applied to layer '{table}'")
# # ==========================================
# # 5. Delete table.sld jika ada
# # ==========================================
# if delete_after:
# table_sld.unlink()
# print(f"[CLEANUP] File SLD '{table}.sld' dihapus")
# # ====================================================
# # 6. Reload GeoServer (opsional tapi aman)
# # ====================================================
# requests.post(
# f"{GEOSERVER_URL}/rest/reload",
# auth=(GEOSERVER_USER, GEOSERVER_PASS)
# )
# # ====================================================
# # 7. Generate GeoServer WMS/WFS link untuk GeoNetwork
# # ====================================================
# wms_link = (
# f"{GEOSERVER_URL}/{WORKSPACE}/wms?"
# f"service=WMS&request=GetMap&layers={WORKSPACE}:{table}"
# )
# wfs_link = (
# f"{GEOSERVER_URL}/{WORKSPACE}/wfs?"
# f"service=WFS&request=GetFeature&typeName={WORKSPACE}:{table}"
# )
# print(f"[GeoServer] WMS URL: {wms_link}")
# print(f"[GeoServer] WFS URL: {wfs_link}")
# return {
# "table": table,
# "style": style_name,
# "wms_url": wms_link,
# "wfs_url": wfs_link
# }

View File

@ -0,0 +1,18 @@
# Re-export the main reader functions from the sibling modules
# (the leading '.' means "import from this same package").
from .reader_csv import read_csv
from .reader_shp import read_shp
from .reader_gdb import read_gdb
from .reader_mpk import read_mpk
from .reader_pdf import read_pdf, convert_df
# Public API of the package: what "from ... import *" exposes.
__all__ = [
    "read_csv",
    "read_shp",
    "read_gdb",
    "read_mpk",
    "read_pdf",
    "convert_df"
]

View File

@ -0,0 +1,228 @@
import pandas as pd
import re
import csv
import os
def detect_header_line(path, max_rows=10):
    """Guess which of the first *max_rows* lines of *path* is the header row.

    Each candidate line is split on common delimiters and scored by the
    fraction of alphabetic cells minus the fraction of numeric cells —
    headers are mostly text while data rows are mostly numbers.  Returns
    the 0-based index of the best-scoring line (0 for an empty file).

    Bug fix: the previous ``[next(f) for _ in range(max_rows)]`` raised
    ``StopIteration`` on files shorter than *max_rows* lines; the bounded
    loop below handles short files.
    """
    lines = []
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        for i, line in enumerate(f):
            if i >= max_rows:
                break
            lines.append(line)
    header_line_idx = 0
    best_score = -1
    for i, line in enumerate(lines):
        cells = re.split(r'[;,|\t]', line.strip())
        alpha_ratio = sum(bool(re.search(r'[A-Za-z]', c)) for c in cells) / max(len(cells), 1)
        digit_ratio = sum(bool(re.search(r'\d', c)) for c in cells) / max(len(cells), 1)
        score = alpha_ratio - digit_ratio
        if score > best_score:
            best_score = score
            header_line_idx = i
    return header_line_idx
def detect_delimiter(path, sample_size=2048):
    """Detect the CSV delimiter of *path* from its first *sample_size* chars.

    Tries ``csv.Sniffer`` first; if sniffing fails, falls back to the first
    of ',', ';', tab, '|' found in the sample, defaulting to ','.
    """
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        sample = f.read(sample_size)
    try:
        # Sniffer raises csv.Error when it cannot decide (e.g. empty or
        # single-column samples) — catch only that, not every Exception.
        return csv.Sniffer().sniff(sample).delimiter
    except csv.Error:
        for delim in (',', ';', '\t', '|'):
            if delim in sample:
                return delim
        return ','
# def read_csv(path: str, sheet: str = None):
# ext = os.path.splitext(path)[1].lower()
# try:
# if ext in ['.csv']:
# header_line = detect_header_line(path)
# delimiter = detect_delimiter(path)
# print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
# df = pd.read_csv(
# path,
# header=header_line,
# sep=delimiter,
# encoding='utf-8',
# low_memory=False,
# thousands=','
# )
# elif ext in ['.xlsx', '.xls']:
# print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
# xls = pd.ExcelFile(path)
# print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")
# if sheet:
# if sheet not in xls.sheet_names:
# raise ValueError(f"Sheet '{sheet}' tidak ditemukan dalam file {os.path.basename(path)}")
# print(f"[INFO] Membaca sheet yang ditentukan: '{sheet}'")
# df = pd.read_excel(xls, sheet_name=sheet, header=0, dtype=str)
# df = df.dropna(how='all').dropna(axis=1, how='all')
# else:
# print("[INFO] Tidak ada sheet yang ditentukan, mencari sheet paling relevan...")
# best_sheet = None
# best_score = -1
# best_df = None
# for sheet_name in xls.sheet_names:
# try:
# temp_df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
# temp_df = temp_df.dropna(how='all').dropna(axis=1, how='all')
# if len(temp_df) == 0 or len(temp_df.columns) < 2:
# continue
# # hitung skor relevansi
# text_ratio = temp_df.applymap(lambda x: isinstance(x, str)).sum().sum() / (temp_df.size or 1)
# row_score = len(temp_df)
# score = (row_score * 0.7) + (text_ratio * 100)
# if score > best_score:
# best_score = score
# best_sheet = sheet_name
# best_df = temp_df
# except Exception as e:
# print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
# continue
# if best_df is not None:
# print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
# df = best_df
# else:
# raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
# for col in df.columns:
# if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
# df[col] = df[col].astype(str).str.replace(',', '', regex=False)
# df[col] = pd.to_numeric(df[col], errors='ignore')
# else:
# raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)")
# except Exception as e:
# print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
# df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
# df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
# df.columns = [str(c).strip() for c in df.columns]
# df = df.dropna(how='all')
# return df
def read_csv(path: str, sheet: str = None):
    """Read a CSV/Excel file into a cleaned DataFrame.

    For ``.csv`` input the header row and delimiter are auto-detected.
    For ``.xlsx``/``.xls`` input a specific *sheet* can be requested;
    otherwise the most "table-like" sheet is chosen (see
    ``_pick_best_sheet``).  Empty rows/columns and unnamed columns are
    dropped, and numeric-looking columns (possibly with thousands
    separators) are converted to numbers.

    Falls back to a plain ``pd.read_csv`` for CSV/TXT input when the main
    path fails; returns an empty DataFrame when an Excel file cannot be
    recovered.
    """
    ext = os.path.splitext(path)[1].lower()
    df = pd.DataFrame()  # default when nothing can be read
    try:
        # --- read ---
        if ext in ['.csv']:
            header_line = detect_header_line(path)
            delimiter = detect_delimiter(path)
            print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
            df = pd.read_csv(
                path,
                header=header_line,
                sep=delimiter,
                encoding='utf-8',
                low_memory=False,
                thousands=','
            )
        elif ext in ['.xlsx', '.xls']:
            print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
            xls = pd.ExcelFile(path, engine='openpyxl')
            print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")
            if sheet:
                if sheet not in xls.sheet_names:
                    raise ValueError(f"Sheet '{sheet}' tidak ditemukan.")
                print(f"[INFO] Membaca sheet yang ditentukan: '{sheet}'")
                df = pd.read_excel(xls, sheet_name=sheet, header=0, dtype=str, engine='openpyxl')
                df = df.dropna(how='all').dropna(axis=1, how='all')
            else:
                print("[INFO] Tidak ada sheet yang ditentukan, mencari sheet paling relevan...")
                df = _pick_best_sheet(xls)
        else:
            raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)")

        # --- clean --- (guarded so numeric-conversion issues never fail the read)
        if not df.empty:
            df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
            df.columns = [str(c).strip() for c in df.columns]
            df = df.dropna(how='all')
            for col in df.columns:
                try:
                    if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
                        clean_col = df[col].astype(str).str.replace(',', '', regex=False)
                        # errors='coerce': invalid cells (#REF!, text) become
                        # NaN instead of crashing the whole read.
                        df[col] = pd.to_numeric(clean_col, errors='coerce')
                except Exception as ex:
                    # Leave the column as string/object and move on.
                    print(f"[WARN] Gagal konversi numerik pada kolom '{col}': {ex}")
        return df
    except Exception as e:
        print(f"[WARN] Gagal membaca file utama ({e}).")
        # Only fall back to the plain CSV reader for text-based input;
        # never force-read an Excel file with read_csv.
        if ext in ['.csv', '.txt']:
            print("[INFO] Mencoba fallback ke default CSV reader...")
            try:
                return pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
            except Exception as e2:
                print(f"[ERROR] Fallback CSV juga gagal: {e2}")
        print("[ERROR] Tidak dapat memulihkan pembacaan file Excel.")
        return pd.DataFrame()


def _pick_best_sheet(xls):
    """Return the most table-like sheet of *xls* as a DataFrame.

    Score = 0.7 * row count + 100 * fraction of string cells; sheets that
    are empty or have fewer than two columns are skipped.  Raises
    ``ValueError`` when no sheet qualifies.

    Uses per-column ``Series.map`` to count string cells: the previous
    ``DataFrame.applymap`` is deprecated (removed in recent pandas), and
    its failure was silently swallowed by the per-sheet except, making
    every sheet look unreadable.
    """
    best_sheet = None
    best_score = -1
    best_df = None
    for sheet_name in xls.sheet_names:
        try:
            temp_df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str, engine='openpyxl')
            temp_df = temp_df.dropna(how='all').dropna(axis=1, how='all')
            if len(temp_df) == 0 or len(temp_df.columns) < 2:
                continue
            str_cells = sum(
                temp_df[c].map(lambda x: isinstance(x, str)).sum()
                for c in temp_df.columns
            )
            text_ratio = str_cells / (temp_df.size or 1)
            score = (len(temp_df) * 0.7) + (text_ratio * 100)
            if score > best_score:
                best_score = score
                best_sheet = sheet_name
                best_df = temp_df
        except Exception as e:
            print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
            continue
    if best_df is None:
        raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
    print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
    return best_df

View File

@ -0,0 +1,75 @@
import geopandas as gpd
import fiona
import zipfile
import tempfile
import os
import shutil
def read_gdb(zip_path: str, layer: str = None):
    """Read a File Geodatabase packed inside a ZIP into a GeoDataFrame.

    Accepts either a proper ``*.gdb`` folder inside the archive or loose
    ``.gdbtable`` files (the folder is then rebuilt). If no CRS is present the
    result is assumed to be EPSG:4326.

    Args:
        zip_path: Path to a ``.zip`` archive containing the geodatabase.
        layer: Optional layer name; defaults to the first available layer.

    Raises:
        ValueError: If the file is not a ZIP, contains no GDB data, has no
            readable layer, or the layer cannot be read.
    """
    if not zip_path.lower().endswith(".zip"):
        raise ValueError("File GDB harus berupa ZIP yang berisi folder .gdb atau file .gdbtable")
    tmpdir = tempfile.mkdtemp()
    # BUG FIX: the temp dir used to leak whenever extraction, layer listing or
    # validation raised; the finally block now guarantees cleanup on all paths.
    try:
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(tmpdir)
        # macOS zips ship a __MACOSX shadow folder that confuses the GDB scan
        macosx_path = os.path.join(tmpdir, "__MACOSX")
        if os.path.exists(macosx_path):
            shutil.rmtree(macosx_path)
        gdb_folders = []
        for root, dirs, _ in os.walk(tmpdir):
            for d in dirs:
                if d.lower().endswith(".gdb"):
                    gdb_folders.append(os.path.join(root, d))
        if not gdb_folders:
            # No *.gdb folder: look for loose .gdbtable files and rebuild one
            gdbtable_files = []
            for root, _, files in os.walk(tmpdir):
                for f in files:
                    if f.lower().endswith(".gdbtable"):
                        gdbtable_files.append(os.path.join(root, f))
            if gdbtable_files:
                first_folder = os.path.dirname(gdbtable_files[0])
                base_name = os.path.basename(first_folder)
                gdb_folder_path = os.path.join(tmpdir, f"{base_name}.gdb")
                os.makedirs(gdb_folder_path, exist_ok=True)
                for fpath in os.listdir(first_folder):
                    if ".gdb" in fpath.lower():
                        shutil.move(os.path.join(first_folder, fpath), os.path.join(gdb_folder_path, fpath))
                gdb_folders.append(gdb_folder_path)
            else:
                raise ValueError("Tidak ditemukan folder .gdb atau file .gdbtable di dalam ZIP")
        gdb_path = gdb_folders[0]
        layers = fiona.listlayers(gdb_path)
        chosen_layer = layer or (layers[0] if layers else None)
        if not chosen_layer:
            raise ValueError("Tidak ada layer GDB yang bisa dibaca.")
        print(f"[DEBUG] Membaca layer: {chosen_layer}")
        try:
            gdf = gpd.read_file(gdb_path, layer=chosen_layer)
        except Exception as e:
            raise ValueError(f"Gagal membaca layer dari GDB: {e}")
        if gdf.crs is None:
            # assumes missing CRS means WGS84 — TODO confirm with data producers
            gdf.set_crs("EPSG:4326", inplace=True)
        return gdf
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)

View File

@ -0,0 +1,72 @@
import os
import tempfile
import json
from io import BytesIO
import geopandas as gpd
from py7zr import SevenZipFile
import pyogrio
def find_data_source(extract_dir: str):
    """Locate the first supported data source (.gdb folder or .shp file) under extract_dir.

    A geodatabase folder takes priority over a shapefile.

    Raises:
        ValueError: When neither a .gdb folder nor a .shp file is found.
    """
    # First pass: geodatabase folders
    for root, subdirs, _ in os.walk(extract_dir):
        for candidate in subdirs:
            if candidate.lower().endswith(".gdb"):
                return os.path.join(root, candidate)
    # Second pass: shapefiles
    for root, _, filenames in os.walk(extract_dir):
        for candidate in filenames:
            if candidate.lower().endswith(".shp"):
                return os.path.join(root, candidate)
    raise ValueError("Tidak ditemukan data source yang didukung (.gdb atau .shp).")
def get_main_layer(gdb_path: str):
    """Return the main layer name of a geodatabase.

    Prefers the first layer whose name does not end with ``__ATTACH``
    (attachment tables); falls back to the very first layer.

    Raises:
        ValueError: If the layer list cannot be read or is empty.
    """
    try:
        available = pyogrio.list_layers(gdb_path)
        for entry in available:
            name = entry[0]
            if not name.lower().endswith("__attach"):
                return name
        if available:
            return available[0][0]
        raise ValueError(f"Tidak ada layer utama yang valid di {gdb_path}")
    except Exception as e:
        raise ValueError(f"Gagal membaca daftar layer GDB: {e}")
def read_mpk(path: str):
    """Read an ArcGIS .mpk package (a 7z archive) into a WGS84 GeoDataFrame.

    Extracts the archive to a temp dir, locates the .gdb/.shp payload and
    reprojects the result to EPSG:4326.

    Raises:
        ValueError: On empty/corrupt archives or when the CRS is missing.
    """
    with open(path, "rb") as fh:
        payload = fh.read()
    if not payload:
        raise ValueError("File MPK kosong atau tidak valid.")
    with tempfile.TemporaryDirectory() as workdir:
        try:
            with SevenZipFile(BytesIO(payload), mode="r") as archive:
                archive.extractall(path=workdir)
        except Exception as e:
            raise ValueError(f"File MPK rusak atau tidak valid: {e}")
        source = find_data_source(workdir)
        if source.lower().endswith(".gdb"):
            gdf = gpd.read_file(source, layer=get_main_layer(source))
        else:
            gdf = gpd.read_file(source)
        if gdf.crs is None:
            raise ValueError("CRS tidak terdeteksi. Pastikan file memiliki informasi proyeksi (.prj).")
        gdf = gdf.to_crs(epsg=4326)
        print(f"[INFO] Berhasil membaca {len(gdf)} fitur")
        return gdf

View File

@ -0,0 +1,288 @@
import re
import pdfplumber
import pandas as pd
from app.mapset_pipeline.utils.pdf_cleaner import get_number_column_index, get_start_end_number, normalize_number_column, row_ratio, has_mixed_text_and_numbers, is_short_text_row, parse_page_selection, filter_geo_admin_column, cleaning_column
from services.upload_file.upload_exceptions import PDFReadError
from utils.logger_config import setup_logger
logger = setup_logger(__name__)
def detect_header_rows(rows):
    """Split raw table rows into ``(header_rows, body_rows)``.

    The body is assumed to start at the first row (from index 1) that mixes
    text and numbers, has a numeric-cell ratio above 0.3, contains a pure
    running-number cell, or follows a fully non-numeric row with a numeric one.
    Short text rows in the header region are kept only when the next header
    candidate is fully non-numeric.

    Returns:
        tuple[list, list]: header rows and body rows; both empty for no input.
    """
    if not rows:
        # BUG FIX: callers unpack `head, body = detect_header_rows(...)`;
        # returning a bare [] raised ValueError on empty input.
        return [], []
    ratios = [row_ratio(r) for r in rows]
    body_start_index = None
    for i in range(1, len(rows)):
        row = rows[i]
        if has_mixed_text_and_numbers(row):
            body_start_index = i
            break
        if ratios[i] > 0.3:
            body_start_index = i
            break
        # A standalone integer cell usually marks a numbered data row
        if any(isinstance(c, str) and re.match(r'^\d+$', c.strip()) for c in row):
            body_start_index = i
            break
        # Transition from fully non-numeric row to a numeric one
        if ratios[i - 1] == 0 and ratios[i] > 0:
            body_start_index = i
            break
    if body_start_index is None:
        body_start_index = len(rows)
    potential_headers = rows[:body_start_index]
    body_filtered = rows[body_start_index:]
    header_filtered = []
    for idx, row in enumerate(potential_headers):
        if is_short_text_row(row):
            # Keep a short text row only if the following header row is non-numeric
            if idx + 1 < len(potential_headers) and ratios[idx + 1] == 0:
                header_filtered.append(row)
            else:
                continue
        else:
            header_filtered.append(row)
    return header_filtered, body_filtered
def merge_multiline_header(header_rows):
    """Collapse a multi-row header into a single row.

    For each column, the last non-blank cell wins; newlines are flattened to
    spaces and blank columns are dropped from the result.
    """
    merged = []
    for column_cells in zip(*header_rows):
        chosen = ''
        for cell in reversed(column_cells):
            if cell and str(cell).strip():
                chosen = cell
                break
        merged.append(str(chosen).replace('\n', ' ').strip())
    return [cell for cell in merged if cell not in ['', None]]
def merge_parsed_table(tables):
    """Stitch cross-page table fragments onto their root tables.

    A table whose number column starts at 1 (or that has no number column)
    is a root; any other table is a continuation fragment. A fragment is
    appended to the first root with identical columns whose numbering it
    extends by exactly one.
    """
    anchors = []
    continuations = []
    # Step 1: classify each table as root or fragment
    for tbl in tables:
        number_idx = get_number_column_index(tbl["columns"])
        if number_idx is None:
            anchors.append(tbl)
            continue
        first_no, _ = get_start_end_number(tbl["rows"], number_idx)
        if first_no == 1:
            anchors.append(tbl)
        else:
            continuations.append(tbl)
    # Step 2: attach fragments to matching roots
    for cont in continuations:
        cont_idx = get_number_column_index(cont["columns"])
        cont_start, _ = get_start_end_number(cont["rows"], cont_idx)
        for anchor in anchors:
            if anchor["columns"] != cont["columns"]:
                continue
            anchor_idx = get_number_column_index(anchor["columns"])
            _, anchor_end = get_start_end_number(anchor["rows"], anchor_idx)
            if cont_start == anchor_end + 1:
                anchor["rows"].extend(cont["rows"])
                break  # a fragment may attach to only one root
    return anchors
def read_pdf(path: str, page: str):
    """Extract tables from a PDF semi-automatically using `pdfplumber`.

    Pipeline:
        1. Open the PDF and resolve the page selection (e.g. "1,3-5").
        2. Detect and extract raw tables on each selected page.
        3. Split header rows from body rows (`detect_header_rows`).
        4. Merge multi-line headers and clean body columns (`cleaning_column`).
        5. Keep geo/admin tables only (`filter_geo_admin_column`), stitch
           cross-page fragments (`merge_parsed_table`) and normalize the
           running-number column.

    Args:
        path: Location of the PDF file.
        page: Page selection string such as "1", "2-4" or "1,3-5"; an empty
            value defaults to "1".

    Returns:
        list[dict]: Tables as ``{"title": ..., "columns": [...], "rows": [...]}``.

    Raises:
        PDFReadError: On any read/parse failure (code 422).
    """
    # NOTE: a ~50-line commented-out older copy of this function was removed here.
    try:
        pdf_path = path
        selectedPage = page if page else "1"
        tables_data = []
        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            selected_pages = parse_page_selection(selectedPage, total_pages)
            logger.info(f"[INFO] Total Halaman PDF: {total_pages}")
            logger.info(f"[INFO] Total Halaman yang dipilih: {len(selected_pages)}")
            logger.info(f"[INFO] Halaman yang dipilih untuk dibaca: {selected_pages}")
            for page_num in selected_pages:
                pdf_page = pdf.pages[page_num - 1]
                tables = pdf_page.find_tables()
                logger.info(f"\n\n[INFO] Halaman {page_num}: {len(tables)} tabel terdeteksi")
                for i, t in enumerate(tables, start=1):
                    table = t.extract()
                    # Skip degenerate detections (fewer than 3 rows)
                    if len(table) > 2:
                        print(f"[TBL] tabel : {i} - halaman {page_num}")
                        tables_data.append({"page": f"halaman {page_num} - {i}", "table": table})
        logger.info(f"\nTotal tabel terbaca: {len(tables_data)}\n")
        header_only, body_only, page_info = [], [], []
        for tbl in tables_data:
            head, body = detect_header_rows(tbl["table"])
            header_only.append(head)
            body_only.append(body)
            page_info.append(tbl["page"])
        clean_header = [merge_multiline_header(h) for h in header_only]
        clean_body = []
        for i, raw_body in enumerate(body_only):
            # Drop empty cells, then align the body with the merged header
            con_body = [[cell for cell in row if cell not in (None, '')] for row in raw_body]
            cleaned = cleaning_column(clean_header[i], [con_body])
            clean_body.append(cleaned[0])
        parsed = []
        for i, (cols, rows, page) in enumerate(zip(clean_header, clean_body, page_info), start=1):
            parsed.append({
                "title": page,
                "columns": cols,
                "rows": rows
            })
        clean_parsed = filter_geo_admin_column(parsed)
        merge_parsed = merge_parsed_table(clean_parsed)
        logger.info(f"\nTotal tabel valid: {len(merge_parsed)}\n")
        ordered_tables = [normalize_number_column(t) for t in merge_parsed]
        return ordered_tables
    except Exception as e:
        raise PDFReadError(f"Gagal membaca PDF: {e}", code=422)
def convert_df(payload):
    """Convert a parsed-table payload into a pandas DataFrame.

    The payload must carry ``columns`` (list) and ``rows`` (list of lists of
    matching length); an optional ``title`` is stored in ``df.attrs``.

    Raises:
        PDFReadError: Wrapping any validation or construction failure (code 400).
    """
    try:
        if "columns" not in payload or "rows" not in payload:
            raise ValueError("Payload tidak memiliki key 'columns' atau 'rows'.")
        if not isinstance(payload["columns"], list):
            raise TypeError("'columns' harus berupa list.")
        if not isinstance(payload["rows"], list):
            raise TypeError("'rows' harus berupa list.")
        expected = len(payload["columns"])
        for i, record in enumerate(payload["rows"]):
            if len(record) != expected:
                raise ValueError(f"Jumlah elemen di baris ke-{i} tidak sesuai jumlah kolom.")
        frame = pd.DataFrame(payload["rows"], columns=payload["columns"])
        if "title" in payload:
            frame.attrs["title"] = payload["title"]
        return frame
    except Exception as e:
        raise PDFReadError(f"Gagal konversi payload ke DataFrame: {e}", code=400)

View File

@ -0,0 +1,60 @@
import geopandas as gpd
import fiona
import zipfile
import tempfile
import os
import shutil
from shapely.geometry import shape
def read_shp(path: str):
    """Read a shapefile (bare .shp or inside a ZIP) into a GeoDataFrame.

    When geometry is missing/empty after the normal read, the features are
    rebuilt from the raw fiona records. Missing CRS defaults to EPSG:4326.

    Raises:
        ValueError: On empty path, ZIP without .shp, or unreadable shapefile.
    """
    if not path:
        raise ValueError("Path shapefile tidak boleh kosong.")
    tmpdir = None
    # BUG FIX: the temp extraction dir used to leak whenever a ValueError was
    # raised before the cleanup lines; finally now guarantees removal.
    try:
        if path.lower().endswith(".zip"):
            tmpdir = tempfile.mkdtemp()
            with zipfile.ZipFile(path, "r") as zip_ref:
                zip_ref.extractall(tmpdir)
            shp_files = []
            for root, _, files in os.walk(tmpdir):
                for f in files:
                    if f.lower().endswith(".shp"):
                        shp_files.append(os.path.join(root, f))
            if not shp_files:
                raise ValueError("Tidak ditemukan file .shp di dalam ZIP.")
            shp_path = shp_files[0]
            print(f"[DEBUG] Membaca shapefile: {os.path.basename(shp_path)}")
        else:
            shp_path = path
        try:
            gdf = gpd.read_file(shp_path)
        except Exception as e:
            raise ValueError(f"Gagal membaca shapefile: {e}")
        if "geometry" not in gdf.columns or gdf.geometry.is_empty.all():
            print("[WARN] Geometry kosong. Mencoba membangun ulang dari fitur mentah...")
            with fiona.open(shp_path) as src:
                features = []
                for feat in src:
                    geom = shape(feat["geometry"]) if feat["geometry"] else None
                    props = feat["properties"]
                    props["geometry"] = geom
                    features.append(props)
                gdf = gpd.GeoDataFrame(features, geometry="geometry", crs=src.crs)
        if gdf.crs is None:
            # assumes missing CRS means WGS84 — TODO confirm with data producers
            gdf.set_crs("EPSG:4326", inplace=True)
        return gdf
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)

View File

@ -0,0 +1,259 @@
import os
import json
import asyncio
import pandas as pd
from shapely import wkt, wkb
from shapely.geometry import MultiPolygon, MultiLineString
from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError
# Import koneksi database Anda
from database.connection import engine
from app.mapset_pipeline.utils.formatters import str_to_date
async def generate_unique_table_name(base_name: str) -> str:
    """Return a slugified table name, appending _2, _3, ... until unused."""
    slug = base_name.lower().replace(" ", "_").replace("-", "_")
    candidate = slug
    suffix = 2
    async with engine.connect() as conn:
        while True:
            # to_regclass(...) yields NULL when no relation of that name exists
            # in the current search path
            probe = await conn.execute(
                text("SELECT to_regclass(:tname)"),
                {"tname": candidate}
            )
            if not probe.scalar():
                return candidate
            candidate = f"{slug}_{suffix}"
            suffix += 1
async def insert_parquet_to_postgis(filename: str, table_name: str):
    """
    Read a temporary parquet file, clean the data, and bulk-COPY it into
    PostGIS via the asyncpg pool.

    Args:
        filename: Parquet file name inside the local "tmp" directory.
        table_name: Target table name (assumed already unique).

    Returns:
        dict: {"table_name", "row_count", "geom_type"} describing the result.

    Raises:
        FileNotFoundError: If the temp parquet file is missing.
        ValueError: If no GEOM column exists or no valid geometry remains.
    """
    # Imported lazily — presumably to avoid a circular import with the app
    # entry point; verify against main.py
    from main import db_pool
    file_path = os.path.join("tmp", filename)
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File temp {file_path} tidak ditemukan")
    try:
        loop = asyncio.get_running_loop()
        # Read parquet off the event loop (CPU bound for large files)
        df = await loop.run_in_executor(None, pd.read_parquet, file_path)
        # 1. CLEAN COLUMN NAMES
        df.columns = [str(col).strip().upper() for col in df.columns]
        # Standardize GEOM column (NOTE(review): this rename is a no-op)
        if "GEOM" in df.columns:
            df.rename(columns={"GEOM": "GEOM"}, inplace=True)
        if "GEOM" not in df.columns:
            raise ValueError("Kolom GEOM tidak ditemukan dalam Parquet")
        # 2. PREPARE DATA ROWS
        clean_rows = []
        geom_types = set()
        # Every column except the geometry itself
        attr_columns = [col for col in df.columns if col != "GEOM"]
        for row in df.itertuples(index=False):
            # --- Handle GEOM: accept WKT strings or WKB bytes ---
            raw_geom = getattr(row, "GEOM", None)
            if not raw_geom: continue
            try:
                geom = None
                if isinstance(raw_geom, str):
                    geom = wkt.loads(raw_geom)
                elif isinstance(raw_geom, bytes):
                    geom = wkb.loads(raw_geom)
                if not geom: continue
                # buffer(0) repairs light topology errors (self-intersections)
                if not geom.is_valid:
                    geom = geom.buffer(0)
                # Force Multi-geometry so the final column type is uniform
                gtype = geom.geom_type.upper()
                if gtype == "POLYGON": geom = MultiPolygon([geom])
                elif gtype == "LINESTRING": geom = MultiLineString([geom])
                geom_types.add(geom.geom_type)
                # Serialize as EWKT with SRID 4326 for the TEXT staging column
                ewkt = f"SRID=4326;{geom.wkt}"
            except Exception:
                continue  # Skip rows with broken geometry
            # --- Handle attributes: everything is forced to string (or None) ---
            row_data = []
            for col in attr_columns:
                val = getattr(row, col, None)
                if val is not None:
                    row_data.append(str(val))
                else:
                    row_data.append(None)
            row_data.append(ewkt)
            clean_rows.append(tuple(row_data))
        if not clean_rows:
            raise ValueError("Data valid kosong setelah pemrosesan geometry")
        # 3. DATABASE OPERATIONS
        # Pick one observed geometry type; "GEOM" is a placeholder when none found
        final_geom_type = list(geom_types)[0].upper() if geom_types else "GEOM"
        if "MULTI" not in final_geom_type and final_geom_type != "GEOM":
            final_geom_type = "MULTI" + final_geom_type
        # A. CREATE TABLE — all attributes land as TEXT first, for safety
        col_defs = [f'"{col}" TEXT' for col in attr_columns]
        create_sql = f"""
        CREATE TABLE {table_name} (
            _id SERIAL PRIMARY KEY,
            {', '.join(col_defs)},
            geom TEXT
        );
        """
        async with db_pool.acquire() as conn:
            # Create table
            await conn.execute(create_sql)
            # B. COPY data (bulk insert)
            target_cols = attr_columns + ['geom']
            # asyncpg quotes column names automatically
            await conn.copy_records_to_table(
                table_name,
                records=clean_rows,
                columns=target_cols
            )
            # C. Convert the staging TEXT column to a real geometry + GIST index
            alter_sql = f"""
            ALTER TABLE {table_name}
            ALTER COLUMN geom TYPE geometry({final_geom_type}, 4326)
            USING ST_Force2D(geom::geometry)::geometry({final_geom_type}, 4326);
            CREATE INDEX idx_{table_name}_geom ON {table_name} USING GIST (geom);
            """
            await conn.execute(alter_sql)
        print(f"[SUCCESS] Upload {len(clean_rows)} baris ke tabel {table_name}.")
        # Remove the temp file after success (best effort)
        try:
            os.remove(file_path)
        except OSError:
            pass
        return {
            "table_name": table_name,
            "row_count": len(clean_rows),
            "geom_type": final_geom_type
        }
    except Exception as e:
        print(f"[ERROR] Processing parquet to DB: {e}")
        raise e
async def save_author_metadata(payload_author: dict, table_name: str, dataset_title: str,
                               geom_types: list, row_count: int, user_id: int):
    """
    Persist author metadata and dataset information into
    ``backend.author_metadata``.

    Args:
        payload_author: Author form fields (abstract, keywords, contacts, ...).
        table_name: PostGIS table the dataset was loaded into.
        dataset_title: Human-readable dataset title.
        geom_types: Geometry type names found in the dataset (stored as JSON).
        row_count: Number of geometries inserted.
        user_id: Id of the uploading user.
    """
    query = text("""
        INSERT INTO backend.author_metadata (
            table_title,
            dataset_title,
            dataset_abstract,
            keywords,
            topic_category,
            date_created,
            dataset_status,
            organization_name,
            contact_person_name,
            contact_email,
            contact_phone,
            geom_type,
            user_id,
            process,
            geometry_count
        ) VALUES (
            :table_title,
            :dataset_title,
            :dataset_abstract,
            :keywords,
            :topic_category,
            :date_created,
            :dataset_status,
            :organization_name,
            :contact_person_name,
            :contact_email,
            :contact_phone,
            :geom_type,
            :user_id,
            :process,
            :geometry_count
        )
    """)
    params = {
        "table_title": table_name,
        "dataset_title": dataset_title,
        "dataset_abstract": payload_author.get("abstract"),
        "keywords": payload_author.get("keywords"),
        # topicCategory arrives as a list; stored as a comma-separated string
        "topic_category": ", ".join(payload_author.get("topicCategory", [])),
        # str_to_date returns None for empty/unparseable input
        "date_created": str_to_date(payload_author.get("dateCreated")),
        "dataset_status": payload_author.get("status"),
        "organization_name": payload_author.get("organization"),
        "contact_person_name": payload_author.get("contactName"),
        "contact_email": payload_author.get("contactEmail"),
        "contact_phone": payload_author.get("contactPhone"),
        # Stored as a JSON array string, e.g. '["MultiPolygon"]'
        "geom_type": json.dumps(geom_types),
        "user_id": user_id,
        # Pipeline-stage marker; presumably consumed downstream — verify
        "process": 'CLEANSING',
        "geometry_count": row_count
    }
    async with engine.begin() as conn:
        await conn.execute(query, params)
async def call_cleansing_procedure(table_name: str):
    """Run the geometry-cleansing stored procedure for the given table.

    Returns:
        str: "done" on success.

    Raises:
        RuntimeError: When the database call fails.
    """
    try:
        print(f"[INFO] Memulai cleansing database untuk tabel: {table_name}")
        async with engine.begin() as conn:
            # Safe parameter binding for the table-name argument
            await conn.execute(
                text("CALL pr_cleansing_satupeta_polygon(:table_name, NULL);"),
                {"table_name": table_name}
            )
        print(f"[SUCCESS] Cleansing selesai untuk tabel: {table_name}")
    except SQLAlchemyError as e:
        print(f"[ERROR] Cleansing database gagal: {e}")
        # Re-raise so the service layer knows the step failed
        raise RuntimeError(f"Database cleansing failed: {str(e)}")
    return "done"

286
app/mapset_pipeline/service.py Executable file
View File

@ -0,0 +1,286 @@
import os
import shutil
import pandas as pd
from fastapi import UploadFile, HTTPException
from typing import Optional
# --- Internal Modules ---
from .api.schemas import UploadRequest, PdfRequest
from .core.processing.analyzer import analyze_and_clean_dataframe, publish_mapset
from .core.readers import (
read_csv,
read_shp,
read_gdb,
read_mpk,
read_pdf,
convert_df
)
from .data.repository import (
generate_unique_table_name,
insert_parquet_to_postgis,
save_author_metadata,
call_cleansing_procedure
)
from app.mapset_pipeline.utils.file_ops import (
detect_zip_type,
generate_job_id,
)
from app.mapset_pipeline.utils.formatters import (
save_xml_to_sld,
)
# --- Legacy/External Modules (Sesuai kode asli Anda) ---
from app.core.config import UPLOAD_FOLDER, MAX_FILE_MB, GEONETWORK_URL
from utils.logger_config import log_activity
# from api.routers.datasets_router import (
# upload_to_main
# )
async def handle_file_analysis(
    file: UploadFile,
    page: Optional[str] = "",
    sheet: Optional[str] = "",
    fileDesc: Optional[str] = ""
):
    """
    Orchestrator for the /upload endpoint.

    1. Validate size and persist the uploaded file.
    2. Pick a reader based on the file extension.
    3. Run the analysis/cleaning processor on the resulting DataFrame.
    4. Always remove the raw uploaded file afterwards.

    Raises:
        HTTPException: 413 (too large), 400 (unsupported/invalid ZIP),
            422 (no valid table), 500 (unexpected failure).
    """
    fname = file.filename
    ext = os.path.splitext(fname)[1].lower()
    # 1. Validate size & persist (whole file read in memory — beware huge uploads)
    contents = await file.read()
    size_mb = len(contents) / (1024 * 1024)
    if size_mb > MAX_FILE_MB:
        raise HTTPException(status_code=413, detail="Ukuran File Terlalu Besar")
    tmp_path = UPLOAD_FOLDER / fname
    os.makedirs(UPLOAD_FOLDER, exist_ok=True)
    with open(tmp_path, "wb") as f:
        f.write(contents)
    df = None
    try:
        # 2. Route to a reader by extension
        print(f"[INFO] Processing file type: {ext}")
        if ext == ".csv":
            df = read_csv(str(tmp_path))
        elif ext == ".xlsx":
            df = read_csv(str(tmp_path), sheet)  # read_csv also handles xlsx sheets
        elif ext == ".mpk":
            df = read_mpk(str(tmp_path))
        elif ext == ".pdf":
            # The PDF reader may return several candidate tables; let the client pick
            tbl = read_pdf(tmp_path, page)
            if len(tbl) == 0:
                return {
                    "message": "Tidak ditemukan tabel valid pada halaman yang dipilih",
                    "tables": {},
                    "file_type": ext
                }
            elif len(tbl) > 1:
                return {
                    "message": "File berhasil dibaca, ditemukan banyak tabel.",
                    "tables": tbl,
                    "file_type": ext
                }
            else:
                df = convert_df(tbl[0])
        elif ext == ".zip":
            zip_type = detect_zip_type(str(tmp_path))
            if zip_type == "shp":
                df = read_shp(str(tmp_path))
            elif zip_type == "gdb":
                df = read_gdb(str(tmp_path))
            else:
                raise HTTPException(status_code=400, detail="ZIP file tidak mengandung SHP / GDB valid.")
        else:
            raise HTTPException(status_code=400, detail="Unsupported file type")
        # Empty-result guard
        if df is None or (hasattr(df, "empty") and df.empty):
            raise HTTPException(status_code=422, detail="File berhasil dibaca, tetapi tidak ditemukan tabel valid")
        # 3. Analysis / cleaning processor
        result_analysis = await analyze_and_clean_dataframe(df, ext, fname, fileDesc)
        return result_analysis
    except HTTPException:
        # BUG FIX: deliberate HTTP errors (400/422) raised above used to be
        # caught by the generic handler below and re-raised as 500, losing
        # their status codes; propagate them untouched.
        raise
    except Exception as e:
        print(f"[ERROR] handle_file_analysis: {e}")
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # 4. Remove the raw upload; the processor's temp parquet lives on
        # until the frontend sends the ingest request
        if tmp_path.exists():
            try:
                os.remove(tmp_path)
            except Exception:
                pass
async def process_pdf_file(payload: PdfRequest):
    """
    Handle the case where the user uploaded a PDF and picked one table.

    Converts the chosen table payload into a DataFrame and reuses the same
    analysis/cleaning processor as the generic upload path.

    Raises:
        HTTPException: 422 when no valid table, 500 on unexpected failure.
    """
    try:
        # Payload carries the {'columns', 'rows', ...} dict expected by convert_df
        df = convert_df(payload.model_dump())
        if df is None or (hasattr(df, "empty") and df.empty):
            raise HTTPException(status_code=422, detail="Tidak ada tabel valid dalam PDF")
        return await analyze_and_clean_dataframe(
            df, '.pdf', payload.fileName, payload.fileDesc
        )
    except HTTPException:
        # BUG FIX: keep the intended 422 instead of re-wrapping it as 500
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
async def execute_postgis_ingestion(payload: UploadRequest, user_id: int):
    """
    Orchestrator for the /process-to-postgis endpoint.

    1. Receive the data (JSON rows).
    2. Convert it to a temporary Parquet file.
    3. Upload to PostGIS (via repository).
    4. Save metadata (via repository).
    5. Trigger cleansing & publishing.
    6. Log the activity.

    Args:
        payload: Upload request (title, rows, author form, SLD style).
        user_id: Id of the uploading user.

    Returns:
        dict: Job summary (job_id, table name, counts, metadata uuid).

    Raises:
        HTTPException: 500 wrapping any failure (after logging the error).
    """
    job_id = generate_job_id(str(user_id))
    try:
        # 1. Generate a unique table name from the dataset title
        table_name = await generate_unique_table_name(payload.title)
        # 2. Data preparation (JSON -> DataFrame -> Parquet).
        # Parquet is used because insert_parquet_to_postgis reads a file;
        # this also separates the API's memory load from the DB process.
        df = pd.DataFrame(payload.rows)
        # Upper case columns
        df.columns = [col.upper() for col in df.columns]
        # Standardize the geometry column name coming from the frontend
        if "GEOMETRY" in df.columns:
            df.rename(columns={"GEOMETRY": "GEOM"}, inplace=True)
        # Save to a temp file for the repository to process
        temp_parquet_name = f"{job_id}.parquet"
        temp_parquet_path = os.path.join("tmp", temp_parquet_name)
        os.makedirs("tmp", exist_ok=True)
        # Save parquet (pyarrow or fastparquet engine)
        df.to_parquet(temp_parquet_path, index=False)
        # 3. Insert into PostGIS: reads the parquet, cleans geometry, COPYs to DB
        db_result = await insert_parquet_to_postgis(temp_parquet_name, table_name)
        # 4. Save metadata using the DB-reported geometry type / row count
        # (more accurate than the request payload)
        final_geom_types = [db_result['geom_type']]  # simplified to a list
        row_count = db_result['row_count']
        await save_author_metadata(
            payload_author=payload.author,
            table_name=table_name,
            dataset_title=payload.title,
            geom_types=final_geom_types,
            row_count=row_count,
            user_id=user_id
        )
        # 5. Activity logging
        await log_activity(
            user_id=user_id,
            action_type="UPLOAD",
            action_title=f"Upload dataset {table_name}",
            details={"table_name": table_name, "rows": row_count}
        )
        # 6. Post-processing (external APIs)
        result = {
            "job_id": job_id,
            "job_status": "wait",
            "table_name": table_name,
            "status": "success",
            "message": f"Tabel '{table_name}' berhasil dibuat.",
            "total_rows": row_count,
            "geometry_type": final_geom_types,
            "crs": payload.author.get("crs", "EPSG:4326"),
            "metadata_uuid": ""
        }
        # Persist the SLD style for this job
        save_xml_to_sld(payload.style, job_id)
        # Cleansing via stored procedure; a failure only marks job_status
        try:
            cleansing_status = await call_cleansing_procedure(table_name)
        except Exception as e:
            cleansing_status = "failed"
            print(f"Cleansing warning: {e}")
        result['job_status'] = cleansing_status
        # Publish layer (GeoServer/GeoNetwork)
        publish_info = await publish_mapset(table_name, job_id)
        result['metadata_uuid'] = publish_info.get('uuid', '')
        # 7. Upload to main portal (mapset integration)
        mapset_payload = {
            "name": payload.title,
            "description": payload.author.get("abstract"),
            "scale": "1:25000",
            # Hardcoded ids kept from the original code (consider moving to config/env)
            'projection_system_id': '0196c746-d1ba-7f1c-9706-5df738679cc7',
            "category_id": payload.author.get("mapsetCategory"),
            "data_status": "sementara",
            'classification_id': '01968b4b-d3f9-76c9-888c-ee887ac31ce4',
            'producer_id': '01968b54-0000-7a67-bd10-975b8923b93e',
            "layer_type": final_geom_types[0],
            'source_id': ['019c03ef-35e1-738b-858d-871dc7d1e4d6'],
            "layer_url": publish_info.get('geos_link', ''),
            "metadata_url": f"{GEONETWORK_URL}/srv/eng/catalog.search#/metadata/{publish_info.get('uuid', '')}",
            "coverage_level": "provinsi",
            "coverage_area": "kabupaten",
            "data_update_period": "Tahunan",
            "data_version": "2026",
            "is_popular": False,
            "is_active": True,
            'regional_id': '01968b53-a910-7a67-bd10-975b8923b92e',
            "notes": "Mapset baru dibuat",
            "status_validation": "on_verification",
        }
        # NOTE(review): mapset_payload is built but the upload call is disabled
        # await upload_to_main(mapset_payload)
        return result
    except Exception as e:
        # Error handling & logging
        await log_activity(
            user_id=user_id,
            action_type="ERROR",
            action_title="Upload gagal",
            details={"error": str(e)}
        )
        print(f"[ERROR] execute_postgis_ingestion: {e}")
        # Re-raise as HTTPException so the router returns a clean 500
        raise HTTPException(status_code=500, detail=str(e))

View File

@ -0,0 +1,105 @@
import os
import uuid
import zipfile
import geopandas as gpd
from shapely import wkt
from shapely.errors import ShapelyError
from datetime import datetime
def detect_zip_type(zip_path: str) -> str:
    """Classify a ZIP archive as 'gdb', 'shp', or 'unknown' by member names.

    Only inspects the member listing — nothing is extracted. A File
    Geodatabase appears either as a ``*.gdb/`` folder entry or as loose
    ``.gdbtable``-style files.
    """
    gdb_exts = (".gdbtable", ".gdbtablx", ".gdbindexes", ".spx")
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        names = [n.lower() for n in zip_ref.namelist()]
    # Simplified: `endswith(".gdb/")` was redundant with the `in` test
    if any(".gdb/" in n for n in names):
        return "gdb"
    if any(n.endswith(gdb_exts) for n in names):
        return "gdb"
    if any(n.endswith(".shp") for n in names):
        return "shp"
    return "unknown"
def generate_unique_filename(folder="tmp", ext="parquet", digits=6):
    """Return a path '<folder>/<random>.<ext>' that does not yet exist.

    Creates `folder` if needed. NOTE: `digits` is kept for backward
    compatibility but unused — the name is a full uuid4 integer.
    """
    os.makedirs(folder, exist_ok=True)
    while True:
        # BUG FIX: removed duplicated assignment typo `file_id = file_id = ...`
        file_id = uuid.uuid4().int
        filename = f"{folder}/{file_id}.{ext}"
        if not os.path.exists(filename):
            return filename
def generate_job_id(user_id: str) -> str:
    """Compose a job id as '<user_id>_<YYYYmmddHHMMSS>' (local time)."""
    stamp = datetime.now().strftime("%Y%m%d%H%M%S")
    return f"{user_id}_{stamp}"
def dataframe_validation(df_input, tmp_file):
    """
    Validate and clean a WKT-bearing DataFrame, then export it to parquet.

    Runs in a worker thread (CPU bound). Steps: parse WKT safely, drop rows
    without valid geometry, repair topology, set the CRS, uppercase attribute
    column names, and write the result to `tmp_file`.

    Args:
        df_input: DataFrame with a 'geometry' column holding WKT strings.
        tmp_file: Destination parquet path.

    Returns:
        int: Number of rows exported.

    Raises:
        ValueError: If no valid spatial rows remain after parsing.
    """
    # Work on a copy so the caller's DataFrame stays untouched
    export_df = df_input.copy()

    def safe_load_wkt(raw):
        # Any parse failure maps to None and the row is dropped below;
        # intentionally broad so ingestion stays best-effort.
        # (Was `except (ShapelyError, Exception)` — the tuple was redundant.)
        if not isinstance(raw, str):
            return None
        try:
            return wkt.loads(raw)
        except Exception:
            return None

    export_df["geom"] = export_df["geometry"].apply(safe_load_wkt)
    # Drop rows where WKT parsing failed
    # (BUG FIX: removed leftover `print("df", export_df)` debug dump)
    export_df = export_df[export_df["geom"].notnull()]
    if export_df.empty:
        raise ValueError("Tidak ada data spasial valid yang ditemukan.")
    export_df = gpd.GeoDataFrame(export_df, geometry="geom")
    # buffer(0) is the standard GIS trick to repair light topology errors
    # (e.g. self-intersecting polygons)
    export_df["geom"] = export_df["geom"].apply(
        lambda g: g.buffer(0) if not g.is_valid else g
    )
    # Drop geometries that became empty after the repair (rare, but safe)
    export_df = export_df[~export_df["geom"].is_empty]
    # Finalize: drop the raw WKT column, set CRS, uppercase attribute names
    export_df = export_df.drop(columns=["geometry"])
    export_df = export_df.set_crs("EPSG:4326", allow_override=True)
    # .strip() removes ghost spaces (" ID " -> "ID"); 'geom' stays lowercase
    export_df = export_df.rename(
        columns=lambda c: str(c).strip().upper() if c != "geom" else c
    )
    export_df.to_parquet(tmp_file)
    return len(export_df)

View File

@ -0,0 +1,43 @@
import os
import pandas as pd
import numpy as np
from shapely.geometry import base as shapely_base
from shapely.geometry.base import BaseGeometry
from datetime import datetime
def safe_json(value):
    """Convert numpy/pandas/shapely values into JSON-serializable Python types.

    Returns an ``int``/``float``/``bool``/``str`` for the recognized numpy,
    pandas and shapely types, ``None`` for scalar NA values, and the value
    unchanged otherwise.
    """
    # np.integer / np.floating cover ALL numpy widths (int8..int64, uint*,
    # float16..float64), not just the four types the first version listed.
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, np.floating):
        return float(value)
    if isinstance(value, np.bool_):
        return bool(value)
    if isinstance(value, pd.Timestamp):
        return value.isoformat()
    if isinstance(value, shapely_base.BaseGeometry):
        return str(value)  # geometry -> WKT string
    # pd.isna on list-likes returns an ARRAY (truth-value error inside `if`),
    # so only NA-check scalar values.
    if not isinstance(value, (list, tuple, set, dict, np.ndarray)) and pd.isna(value):
        return None
    return value
def str_to_date(raw_date: str):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime.date``.

    Returns ``None`` when *raw_date* is empty/None or cannot be parsed;
    a parse failure is logged but never raised.
    """
    if raw_date:
        try:
            return datetime.strptime(raw_date, "%Y-%m-%d").date()
        # Narrowed from a blanket `except Exception`: only parse-related
        # errors should be swallowed here.
        except (ValueError, TypeError) as e:
            print("[WARNING] Tidak bisa parse dateCreated:", e)
    return None
def save_xml_to_sld(xml_string, filename):
    """Write *xml_string* to ``style_temp/<filename>.sld`` and return the path.

    The temp folder is created on demand; an existing file with the same
    name is overwritten.
    """
    folder_path = 'style_temp'
    os.makedirs(folder_path, exist_ok=True)
    # BUG FIX: the f-string previously had no placeholder, so `filename` was
    # ignored and every call clobbered the same file.
    file_path = os.path.join(folder_path, f"{filename}.sld")
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(xml_string)
    return file_path

View File

@ -0,0 +1,208 @@
import re
import itertools
# Keywords (Indonesian + English) that mark a column as geographic or
# administrative: coordinates, geometry, and region-level names
# (desa/kelurahan/kecamatan/kabupaten/kota/provinsi).
geo_admin_keywords = [
    'lat', 'lon', 'long', 'latitude', 'longitude', 'koordinat', 'geometry', 'geometri',
    'desa', 'kelurahan', 'kel', 'kecamatan', 'kabupaten', 'kab', 'kota', 'provinsi',
    'lokasi', 'region', 'area', 'zone', 'boundary', 'batas'
]
def normalize_text(text):
    """Lowercase *text*, strip everything but [a-z0-9/ ], collapse whitespace."""
    cleaned = re.sub(r'[^a-z0-9/ ]+', ' ', text.lower())
    return re.sub(r'\s+', ' ', cleaned).strip()
def generate_combined_patterns(keywords):
    """Build regex patterns matching ``a / b`` and ``b / a`` for each keyword pair."""
    patterns = []
    for first, second in itertools.combinations(keywords, 2):
        patterns.append(rf'{first}\s*/\s*{second}')
        patterns.append(rf'{second}\s*/\s*{first}')
    return patterns
# Precomputed "keyword / keyword" pair patterns, built once at import time.
combined_patterns = generate_combined_patterns(geo_admin_keywords)
def contains_geo_admin_keywords(text):
    """Return True when *text* mentions a geo/admin keyword or keyword pair.

    Texts shorter than 3 characters after normalization never match.
    """
    cleaned = normalize_text(text)
    if len(cleaned) < 3:
        return False
    if any(re.search(pattern, cleaned) for pattern in combined_patterns):
        return True
    # Whole-word match: keyword bounded by start/end or space, '/', '_', '-'.
    return any(
        re.search(rf'(^|[\s/_-]){kw}([\s/_-]|$)', cleaned)
        for kw in geo_admin_keywords
    )
def filter_geo_admin_column(tables):
    """Keep only the tables whose column names mention a geo/admin keyword."""
    return [
        table
        for table in tables
        if any(contains_geo_admin_keywords(col) for col in table['columns'])
    ]
# Header labels treated as row-number / index markers. Mixed-case variants
# are listed explicitly because the lookup in has_number_header is
# case-sensitive.
NUMBER_HEADER_KEYWORDS = [
    "no","no.","nomor","nomor urut","no urut","No","Nomor","No Urut","Index",
    "ID","Sr No","S/N","SN","Sl No"
]
def has_number_header(header):
    """True when *header* contains any of the row-number keywords (via ``in``)."""
    return any(keyword in header for keyword in NUMBER_HEADER_KEYWORDS)
def is_numbering_column(col_values):
    """Heuristic: more than 60% of non-empty string cells are 1-3 digit numbers."""
    candidates = [v for v in col_values if v and isinstance(v, str)]
    if not candidates:
        return False
    hits = sum(1 for v in candidates if re.fullmatch(r"0*\d{1,3}", v.strip()))
    return hits / len(candidates) > 0.6
def is_numeric_value(v):
    """True for int/float values or strings that look like 1-3 digit numbers."""
    if v is None:
        return False
    if isinstance(v, (int, float)):
        return True
    return isinstance(v, str) and re.fullmatch(r"0*\d{1,3}", v.strip()) is not None
def cleaning_column(headers, bodies):
    """Clean each table body against its header.

    For every (header, body) pair:
      * if the header has no number column but the body's first column looks
        like a numbering column, strip that first column from numeric rows;
      * drop rows whose width does not match the header width.

    Returns the list of cleaned bodies, in the same order as *bodies*.
    """
    cleaned_bodies = []
    for header, body in zip(headers, bodies):
        if not body:
            cleaned_bodies.append(body)
            continue
        header_has_number = has_number_header(header)
        first_col = [row[0] for row in body if row and len(row) > 0]
        first_col_is_numbering = is_numbering_column(first_col)
        if not header_has_number and first_col_is_numbering:
            new_body = []
            for row in body:
                if not row:
                    continue
                # Strip the leading row-number cell only when it is numeric
                # and something remains after removal.
                if is_numeric_value(row[0]) and len(row) > 1:
                    new_body.append(row[1:])
                else:
                    new_body.append(row)
            body = new_body
        # BUG FIX: was `len(headers)` — the number of TABLES — which filtered
        # rows against the wrong width. Each row must match THIS header.
        header_len = len(header)
        filtered_body = [row for row in body if len(row) == header_len]
        cleaned_bodies.append(filtered_body)
    return cleaned_bodies
def parse_page_selection(selectedPage: str, total_pages: int):
    """Parse a selection string like ``"1-3, 5 7"`` into sorted page numbers.

    Invalid tokens are skipped; results are clamped to 1..total_pages.
    An empty/None selection means every page.
    """
    if not selectedPage:
        return list(range(1, total_pages + 1))
    selected = set()
    for token in re.split(r'[,\s]+', selectedPage.strip()):
        try:
            if '-' in token:
                lo, hi = map(int, token.split('-'))
                selected.update(range(lo, hi + 1))
            else:
                selected.add(int(token))
        except ValueError:
            # Malformed token (e.g. "a", "1-2-3", "") — ignore it.
            continue
    return [page for page in sorted(selected) if 1 <= page <= total_pages]
def is_number(s):
    """True when *s*, with ',' and '.' separators removed, is all digits."""
    if s is None:
        return False
    return str(s).strip().replace(',', '').replace('.', '').isdigit()
def row_ratio(row):
    """Fraction of non-empty cells that look numeric; 0 for an all-empty row."""
    filled = [cell for cell in row if cell not in (None, '', ' ')]
    if not filled:
        return 0
    return sum(is_number(cell) for cell in filled) / len(filled)
def has_mixed_text_and_numbers(row):
    """True when the row holds both alphabetic text cells and numeric cells."""
    filled = [cell for cell in row if cell not in (None, '', ' ')]
    found_text = any(
        isinstance(cell, str) and re.search(r'[A-Za-z]', str(cell))
        for cell in filled
    )
    found_number = any(is_number(cell) for cell in filled)
    return found_text and found_number
def is_short_text_row(row):
    """Detect a short text-only row: at most 2 non-numeric cells, < 20 chars total."""
    cells = [str(cell).strip() for cell in row if cell not in (None, '', ' ')]
    if not cells:
        return False
    if any(is_number(cell) for cell in cells):
        return False
    return len(cells) <= 2 and len(" ".join(cells)) < 20
def get_number_column_index(columns):
    """Index of the first column header recognized as a number column, else None."""
    return next(
        (idx for idx, name in enumerate(columns) if has_number_header(name)),
        None,
    )
def get_start_end_number(rows, idx):
    """Return (first, last) values of column *idx* as ints.

    Returns ``(None, None)`` when *rows* is empty, the column is missing,
    or the boundary values are not numeric.
    """
    try:
        return int(rows[0][idx]), int(rows[-1][idx])
    # Narrowed from a bare `except:` (which also swallowed KeyboardInterrupt
    # and SystemExit) to the failures this lookup can actually produce.
    except (IndexError, KeyError, TypeError, ValueError):
        return None, None
def normalize_number_column(table):
    """Rewrite the numbering column of *table* so values increase monotonically.

    Mutates ``table["rows"]`` in place: whenever a row's number is not
    greater than the running value, it is replaced by running value + 1;
    otherwise the running value jumps to the row's number. Rows whose cell
    cannot be parsed as an int are left untouched. Returns *table*.
    """
    columns = table["columns"]
    rows = table["rows"]
    num_idx = get_number_column_index(columns)
    if num_idx is None:
        # No numbering column detected — nothing to normalize.
        return table
    current = None
    for row in rows:
        try:
            val = int(row[num_idx])
        # Narrowed from a bare `except:`: only parse/lookup failures mean
        # "skip this cell"; anything else should propagate.
        except (IndexError, KeyError, TypeError, ValueError):
            continue
        if current is None:
            current = val
        elif val <= current:
            current += 1
        else:
            current = val
        row[num_idx] = str(current)
    return table

0
app/models/__init__.py Normal file → Executable file
View File

0
app/models/base.py Normal file → Executable file
View File

0
app/models/category_model.py Normal file → Executable file
View File

0
app/models/classification_model.py Normal file → Executable file
View File

0
app/models/credential_model.py Normal file → Executable file
View File

0
app/models/feedback_model.py Normal file → Executable file
View File

0
app/models/file_model.py Normal file → Executable file
View File

0
app/models/map_access_model.py Normal file → Executable file
View File

0
app/models/map_projection_system_model.py Normal file → Executable file
View File

0
app/models/map_source_model.py Normal file → Executable file
View File

0
app/models/mapset_history_model.py Normal file → Executable file
View File

0
app/models/mapset_model.py Normal file → Executable file
View File

0
app/models/news_model.py Normal file → Executable file
View File

0
app/models/organization_model.py Normal file → Executable file
View File

0
app/models/refresh_token_model.py Normal file → Executable file
View File

0
app/models/regional_model.py Normal file → Executable file
View File

0
app/models/role_model.py Normal file → Executable file
View File

0
app/models/user_model.py Normal file → Executable file
View File

0
app/repositories/__init__.py Normal file → Executable file
View File

0
app/repositories/base.py Normal file → Executable file
View File

0
app/repositories/category_repository.py Normal file → Executable file
View File

0
app/repositories/classification_repository.py Normal file → Executable file
View File

0
app/repositories/credential_repository.py Normal file → Executable file
View File

0
app/repositories/feedback_repository.py Normal file → Executable file
View File

0
app/repositories/file_repository.py Normal file → Executable file
View File

0
app/repositories/map_access_repository.py Normal file → Executable file
View File

0
app/repositories/map_projection_system_repository.py Normal file → Executable file
View File

0
app/repositories/map_source_repository.py Normal file → Executable file
View File

0
app/repositories/map_source_usage_repository.py Normal file → Executable file
View File

0
app/repositories/mapset_history_repository.py Normal file → Executable file
View File

0
app/repositories/mapset_repository.py Normal file → Executable file
View File

0
app/repositories/news_repository.py Normal file → Executable file
View File

0
app/repositories/organization_repository.py Normal file → Executable file
View File

0
app/repositories/regional_repository.py Normal file → Executable file
View File

0
app/repositories/role_repository.py Normal file → Executable file
View File

0
app/repositories/token_repository.py Normal file → Executable file
View File

0
app/repositories/user_repository.py Normal file → Executable file
View File

0
app/response/res.py Normal file → Executable file
View File

Some files were not shown because too many files have changed in this diff Show More