export
This commit is contained in:
parent
654ad382fe
commit
6c0d8729f7
0
.env.example
Normal file → Executable file
0
.env.example
Normal file → Executable file
3
.gitignore
vendored
Normal file → Executable file
3
.gitignore
vendored
Normal file → Executable file
|
|
@ -172,3 +172,6 @@ cython_debug/
|
|||
|
||||
# PyPI configuration file
|
||||
.pypirc
|
||||
|
||||
|
||||
tests/
|
||||
0
.pre-commit-config.yaml
Normal file → Executable file
0
.pre-commit-config.yaml
Normal file → Executable file
0
.python-version
Normal file → Executable file
0
.python-version
Normal file → Executable file
0
Dockerfile
Normal file → Executable file
0
Dockerfile
Normal file → Executable file
11
addons.txt
Executable file
11
addons.txt
Executable file
|
|
@ -0,0 +1,11 @@
|
|||
pandas = "^3.0.0"
|
||||
geopandas = "^1.1.2"
|
||||
fiona = "^1.10.1"
|
||||
numpy = "^2.4.2"
|
||||
pdfplumber = "^0.11.9"
|
||||
py7zr = "^1.1.0"
|
||||
pyogrio = "^0.12.1"
|
||||
rapidfuzz = "^3.14.3"
|
||||
requests = "^2.32.5"
|
||||
openpyxl = "^3.1.5"
|
||||
pyarrow = "21.0.0"
|
||||
0
alembic.ini
Normal file → Executable file
0
alembic.ini
Normal file → Executable file
0
app/__init__.py
Normal file → Executable file
0
app/__init__.py
Normal file → Executable file
0
app/api/dependencies/__init__.py
Normal file → Executable file
0
app/api/dependencies/__init__.py
Normal file → Executable file
0
app/api/dependencies/auth.py
Normal file → Executable file
0
app/api/dependencies/auth.py
Normal file → Executable file
0
app/api/dependencies/database.py
Normal file → Executable file
0
app/api/dependencies/database.py
Normal file → Executable file
0
app/api/dependencies/factory.py
Normal file → Executable file
0
app/api/dependencies/factory.py
Normal file → Executable file
0
app/api/v1/__init__.py
Normal file → Executable file
0
app/api/v1/__init__.py
Normal file → Executable file
0
app/api/v1/routes/__init__.py
Normal file → Executable file
0
app/api/v1/routes/__init__.py
Normal file → Executable file
0
app/api/v1/routes/auth_route.py
Normal file → Executable file
0
app/api/v1/routes/auth_route.py
Normal file → Executable file
0
app/api/v1/routes/category_route.py
Normal file → Executable file
0
app/api/v1/routes/category_route.py
Normal file → Executable file
0
app/api/v1/routes/classification_route.py
Normal file → Executable file
0
app/api/v1/routes/classification_route.py
Normal file → Executable file
0
app/api/v1/routes/count_route.py
Normal file → Executable file
0
app/api/v1/routes/count_route.py
Normal file → Executable file
0
app/api/v1/routes/credential_route.py
Normal file → Executable file
0
app/api/v1/routes/credential_route.py
Normal file → Executable file
0
app/api/v1/routes/feedback_route.py
Normal file → Executable file
0
app/api/v1/routes/feedback_route.py
Normal file → Executable file
0
app/api/v1/routes/file_route.py
Normal file → Executable file
0
app/api/v1/routes/file_route.py
Normal file → Executable file
0
app/api/v1/routes/geonetwork_route.py
Normal file → Executable file
0
app/api/v1/routes/geonetwork_route.py
Normal file → Executable file
0
app/api/v1/routes/map_projection_system_route.py
Normal file → Executable file
0
app/api/v1/routes/map_projection_system_route.py
Normal file → Executable file
0
app/api/v1/routes/map_source_route.py
Normal file → Executable file
0
app/api/v1/routes/map_source_route.py
Normal file → Executable file
0
app/api/v1/routes/mapset_history_route.py
Normal file → Executable file
0
app/api/v1/routes/mapset_history_route.py
Normal file → Executable file
0
app/api/v1/routes/mapset_route.py
Normal file → Executable file
0
app/api/v1/routes/mapset_route.py
Normal file → Executable file
0
app/api/v1/routes/news_route.py
Normal file → Executable file
0
app/api/v1/routes/news_route.py
Normal file → Executable file
0
app/api/v1/routes/organization_route.py
Normal file → Executable file
0
app/api/v1/routes/organization_route.py
Normal file → Executable file
0
app/api/v1/routes/regional_route.py
Normal file → Executable file
0
app/api/v1/routes/regional_route.py
Normal file → Executable file
0
app/api/v1/routes/role_route.py
Normal file → Executable file
0
app/api/v1/routes/role_route.py
Normal file → Executable file
0
app/api/v1/routes/user_route.py
Normal file → Executable file
0
app/api/v1/routes/user_route.py
Normal file → Executable file
0
app/core/__init__.py
Normal file → Executable file
0
app/core/__init__.py
Normal file → Executable file
0
app/core/config.py
Normal file → Executable file
0
app/core/config.py
Normal file → Executable file
0
app/core/data_types.py
Normal file → Executable file
0
app/core/data_types.py
Normal file → Executable file
0
app/core/database.py
Normal file → Executable file
0
app/core/database.py
Normal file → Executable file
0
app/core/exceptions.py
Normal file → Executable file
0
app/core/exceptions.py
Normal file → Executable file
0
app/core/minio_client.py
Normal file → Executable file
0
app/core/minio_client.py
Normal file → Executable file
0
app/core/params.py
Normal file → Executable file
0
app/core/params.py
Normal file → Executable file
0
app/core/responses.py
Normal file → Executable file
0
app/core/responses.py
Normal file → Executable file
0
app/core/security.py
Normal file → Executable file
0
app/core/security.py
Normal file → Executable file
0
app/main.py
Normal file → Executable file
0
app/main.py
Normal file → Executable file
0
app/mapset_pipeline/__init__,.py
Executable file
0
app/mapset_pipeline/__init__,.py
Executable file
41
app/mapset_pipeline/api/router.py
Executable file
41
app/mapset_pipeline/api/router.py
Executable file
|
|
@ -0,0 +1,41 @@
|
|||
# services/file_pipeline/router.py
|
||||
from fastapi import APIRouter, Depends, File, UploadFile, Form
|
||||
from .schemas import UploadRequest, PdfRequest
|
||||
from app.mapset_pipeline.service import handle_file_analysis, process_pdf_file, execute_postgis_ingestion
|
||||
from app.response.res import successRes, errorRes
|
||||
|
||||
router = APIRouter(prefix="/pipeline", tags=["File Pipeline"])
|
||||
|
||||
@router.post("/analyze")
|
||||
async def upload_file(
|
||||
file: UploadFile = File(...),
|
||||
page: str = Form(""),
|
||||
sheet: str = Form(""),
|
||||
fileDesc: str = Form("")
|
||||
):
|
||||
try:
|
||||
data = await handle_file_analysis(file, page, sheet, fileDesc)
|
||||
return successRes(data=data)
|
||||
except Exception as e:
|
||||
return errorRes(message="Upload failed", details=str(e), status_code=500)
|
||||
|
||||
|
||||
@router.post("/analyze/pdf")
|
||||
async def upload_file(
|
||||
payload: PdfRequest
|
||||
):
|
||||
try:
|
||||
res = await process_pdf_file(payload)
|
||||
return res
|
||||
except Exception as e:
|
||||
return errorRes(message="Upload failed", details=str(e), status_code=500)
|
||||
|
||||
|
||||
@router.post("/publish")
|
||||
async def process_to_postgis(payload: UploadRequest):
|
||||
# user_id bisa diambil dari dependency injection auth
|
||||
try:
|
||||
data = await execute_postgis_ingestion(payload, user_id=2)
|
||||
return successRes(data=data)
|
||||
except Exception as e:
|
||||
return errorRes(message="Processing failed", details=str(e), status_code=500)
|
||||
17
app/mapset_pipeline/api/schemas.py
Executable file
17
app/mapset_pipeline/api/schemas.py
Executable file
|
|
@ -0,0 +1,17 @@
|
|||
from pydantic import BaseModel
|
||||
from typing import List, Dict, Any
|
||||
|
||||
class PdfRequest(BaseModel):
|
||||
title: str
|
||||
columns: List[str]
|
||||
rows: List[List]
|
||||
fileName: str
|
||||
fileDesc: str
|
||||
|
||||
class UploadRequest(BaseModel):
|
||||
title: str
|
||||
path: str
|
||||
rows: List[dict]
|
||||
columns: List[str]
|
||||
author: Dict[str, Any]
|
||||
style: str
|
||||
49
app/mapset_pipeline/core/clients/ai_client.py
Executable file
49
app/mapset_pipeline/core/clients/ai_client.py
Executable file
|
|
@ -0,0 +1,49 @@
|
|||
import requests
|
||||
from typing import Dict, Any
|
||||
from app.core.config import GEN_AI_URL
|
||||
|
||||
URL = GEN_AI_URL
|
||||
|
||||
|
||||
def generate_metadata(payload: Dict[str, Any]) -> Dict[str, Any]:
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
"API_KEY": "testsatupeta"
|
||||
}
|
||||
|
||||
try:
|
||||
response = requests.post(
|
||||
f"{URL}",
|
||||
json=payload,
|
||||
headers=headers,
|
||||
)
|
||||
|
||||
# response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
return {
|
||||
"success": False,
|
||||
"error": str(e)
|
||||
}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Contoh payload
|
||||
payload = {
|
||||
"nama_file_peta": "peta bencana.pdf",
|
||||
"nama_opd": "Badan Penanggulangan Bencana Daerah (BPBD)",
|
||||
"tipe_data_spasial": "Multipolygon",
|
||||
"struktur_atribut_data": {},
|
||||
"metadata": {
|
||||
"judul": "",
|
||||
"abstrak": "",
|
||||
"tujuan": "",
|
||||
"keyword": [],
|
||||
"kategori": [],
|
||||
"kategori_mapset": ""
|
||||
}
|
||||
}
|
||||
|
||||
result = generate_metadata(payload)
|
||||
print(result)
|
||||
181
app/mapset_pipeline/core/processing/analyzer.py
Executable file
181
app/mapset_pipeline/core/processing/analyzer.py
Executable file
|
|
@ -0,0 +1,181 @@
|
|||
import os
|
||||
import asyncio
|
||||
import pandas as pd
|
||||
import geopandas as gpd
|
||||
from app.response.res import errorRes
|
||||
|
||||
from app.mapset_pipeline.utils.file_ops import generate_unique_filename, dataframe_validation
|
||||
from app.mapset_pipeline.utils.formatters import safe_json
|
||||
from .geometry_build import is_geom_empty, detect_and_build_geometry, attach_polygon_geometry_auto
|
||||
from app.mapset_pipeline.core.clients.ai_client import generate_metadata
|
||||
from app.mapset_pipeline.core.publication.publish_geoserver import publish_layer_to_geoserver
|
||||
from app.mapset_pipeline.core.publication.publish_geonetwork import publish_metadata
|
||||
|
||||
async def analyze_and_clean_dataframe(df: pd.DataFrame, ext: str, filename: str, fileDesc: str):
|
||||
"""
|
||||
Fungsi utama untuk memproses DataFrame:
|
||||
1. Deteksi Geometri
|
||||
2. Validasi & Hitung Statistik
|
||||
3. Generate Preview & Warnings
|
||||
4. Generate Metadata (AI)
|
||||
5. Simpan ke Temporary Parquet
|
||||
"""
|
||||
|
||||
# 1. Deteksi Geometri
|
||||
result = detect_and_build_geometry(df, master_polygons=None)
|
||||
|
||||
if not hasattr(result, "geometry") or result.geometry.isna().all():
|
||||
result = attach_polygon_geometry_auto(result)
|
||||
|
||||
def normalize_geom_type(geom_type):
|
||||
if geom_type and geom_type.startswith("Multi"):
|
||||
return geom_type.replace("Multi", "")
|
||||
return geom_type
|
||||
|
||||
# 2. Analisis Tipe Geometri
|
||||
if isinstance(result, gpd.GeoDataFrame) and "geometry" in result.columns:
|
||||
geom_types = (
|
||||
result.geometry
|
||||
.dropna()
|
||||
.geom_type
|
||||
.apply(normalize_geom_type)
|
||||
.unique()
|
||||
)
|
||||
geom_type = geom_types[0] if len(geom_types) > 0 else "None"
|
||||
null_geom = result.geometry.isna().sum()
|
||||
|
||||
print(f"[INFO] Tipe Geometry: {geom_type}")
|
||||
print(f"[INFO] Jumlah geometry kosong: {null_geom}")
|
||||
else:
|
||||
# Fallback jika gagal mendeteksi geometry
|
||||
res = {
|
||||
"message": "Tidak menemukan tabel yang relevan atau kolom geometri.",
|
||||
"file_type": ext,
|
||||
"rows": len(df),
|
||||
"columns": len(df.columns),
|
||||
"geometry_valid": 0,
|
||||
"geometry_empty": 0,
|
||||
"geometry_valid_percent": 0,
|
||||
"warnings": [],
|
||||
"warning_examples": [],
|
||||
"preview": []
|
||||
}
|
||||
# Kita raise error dictionary agar bisa ditangkap oleh router/service
|
||||
# Atau return dictionary error structure
|
||||
return errorRes(message="Tidak berhasil mencocokan geometry pada tabel.", details=res, status_code=422)
|
||||
|
||||
# 3. Cleaning Data Values
|
||||
result = result.replace([pd.NA, float('inf'), float('-inf')], None)
|
||||
|
||||
# Convert Geometry ke WKT untuk analisis teks
|
||||
if isinstance(result, gpd.GeoDataFrame) and 'geometry' in result.columns:
|
||||
# Kita perlu simpan WKT string agar serializable saat preview
|
||||
# Tapi biarkan geometry asli untuk proses parquet nanti
|
||||
pass
|
||||
|
||||
# Hitung Statistik Validitas
|
||||
empty_count = result['geometry'].apply(is_geom_empty).sum()
|
||||
valid_count = len(result) - empty_count
|
||||
match_percentage = (valid_count / len(result)) * 100
|
||||
|
||||
warnings = []
|
||||
if empty_count > 0:
|
||||
warnings.append(
|
||||
f"{empty_count} dari {len(result)} baris tidak memiliki geometry yang valid "
|
||||
f"({100 - match_percentage:.2f}% data gagal cocok)."
|
||||
)
|
||||
|
||||
# Ambil contoh data error
|
||||
if empty_count > 0:
|
||||
examples = result[result['geometry'].apply(is_geom_empty)].head(500)
|
||||
warning_examples = examples.to_dict(orient="records")
|
||||
else:
|
||||
warning_examples = []
|
||||
|
||||
# Prepare Preview Data (Convert WKT for JSON response)
|
||||
# Kita copy agar tidak merusak dataframe utama
|
||||
data_df = result.copy()
|
||||
if 'geometry' in data_df.columns:
|
||||
data_df['geometry'] = data_df['geometry'].apply(
|
||||
lambda g: g.wkt if g is not None else None
|
||||
)
|
||||
|
||||
preview_data = data_df.to_dict(orient="records")
|
||||
|
||||
# Sanitasi JSON (numpy types -> python types)
|
||||
preview_safe = [
|
||||
{k: safe_json(v) for k, v in row.items()} for row in preview_data
|
||||
]
|
||||
|
||||
warning_safe = [
|
||||
{k: safe_json(v) for k, v in row.items()} for row in warning_examples
|
||||
]
|
||||
|
||||
# 4. AI Metadata Generation
|
||||
ai_context = {
|
||||
"nama_file_peta": filename,
|
||||
"nama_opd": "Badan Penanggulangan Bencana Daerah (BPBD) Provinsi Jatim", # Sebaiknya dinamis
|
||||
"tipe_data_spasial": geom_type,
|
||||
"deskripsi_singkat": fileDesc,
|
||||
"struktur_atribut_data": {},
|
||||
}
|
||||
|
||||
try:
|
||||
ai_suggest = generate_metadata(ai_context)
|
||||
except Exception as e:
|
||||
print(f"[WARNING] Gagal generate metadata AI: {e}")
|
||||
ai_suggest = {}
|
||||
|
||||
# 5. Simpan ke Temporary Parquet
|
||||
# Gunakan filename unik agar thread safe
|
||||
tmp_file = generate_unique_filename(folder="tmp", ext="parquet")
|
||||
|
||||
# Proses konversi synchronous dijalankan di thread terpisah agar tidak blocking
|
||||
print('start')
|
||||
await asyncio.to_thread(dataframe_validation, data_df, tmp_file)
|
||||
print('pass')
|
||||
|
||||
response = {
|
||||
"message": "File berhasil dibaca dan dianalisis.",
|
||||
"file_name": filename,
|
||||
"file_type": ext,
|
||||
"rows": int(len(result)),
|
||||
"columns": list(map(str, result.columns)),
|
||||
"geometry_valid": int(valid_count),
|
||||
"geometry_empty": int(empty_count),
|
||||
"geometry_valid_percent": float(round(match_percentage, 2)),
|
||||
"geometry_type": geom_type,
|
||||
"warnings": warnings,
|
||||
"warning_rows": warning_safe,
|
||||
"preview": preview_safe,
|
||||
"metadata_suggest": ai_suggest,
|
||||
"tmp_path": tmp_file
|
||||
}
|
||||
|
||||
return response
|
||||
|
||||
|
||||
async def publish_mapset(table_name: str, job_id: str):
|
||||
try:
|
||||
|
||||
geos_link = publish_layer_to_geoserver(table_name, job_id)
|
||||
|
||||
uuid = await publish_metadata(
|
||||
table_name=table_name,
|
||||
geoserver_links=geos_link
|
||||
)
|
||||
|
||||
# await update_job_status(table_name, "FINISHED", job_id)
|
||||
|
||||
# return uuid
|
||||
return {
|
||||
"geos_link": geos_link["layer_url"],
|
||||
# "uuid": uuid
|
||||
"uuid": "123123"
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
# await update_job_status(table_name, "FAILED", job_id)
|
||||
raise RuntimeError(f"Publish layer gagal: {e}") from e
|
||||
|
||||
|
||||
466
app/mapset_pipeline/core/processing/geometry_build.py
Executable file
466
app/mapset_pipeline/core/processing/geometry_build.py
Executable file
|
|
@ -0,0 +1,466 @@
|
|||
import geopandas as gpd
|
||||
from shapely.geometry import Point, LineString
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import re
|
||||
import os
|
||||
from shapely import wkt
|
||||
from rapidfuzz import process, fuzz
|
||||
from sqlalchemy import create_engine
|
||||
from shapely.geometry.base import BaseGeometry
|
||||
from app.core.config import REFERENCE_DB_URL, REFERENCE_SCHEMA, DESA_REF, KEC_REF, KAB_REF
|
||||
|
||||
# ============================================================
|
||||
# KONFIGURASI DAN KONSTANTA
|
||||
# ============================================================
|
||||
|
||||
COLUMN_ALIASES = {
|
||||
'desa': ['desa', 'kelurahan', 'desa_kelurahan', 'desa/kelurahan', 'nama_desa', 'nama_kelurahan', 'Desa/Kel'],
|
||||
'kecamatan': ['kec', 'kecamatan', 'nama_kec', 'nama_kecamatan'],
|
||||
'kabupaten': ['kab', 'kabupaten', 'kota', 'kabupaten_kota', 'kota_kabupaten', 'kab/kota', 'kota/kabupaten', 'kota/kab']
|
||||
}
|
||||
|
||||
# ============================================================
|
||||
# FUNGSI BANTU ADMINISTRATIF
|
||||
# ============================================================
|
||||
|
||||
def find_admin_column(df, aliases):
|
||||
"""Mencari kolom yang paling cocok untuk tiap level admin (desa/kec/kab)"""
|
||||
matched = {}
|
||||
for level, alias_list in aliases.items():
|
||||
for col in df.columns:
|
||||
col_norm = col.strip().lower().replace(' ', '_').replace('/', '_')
|
||||
if any(alias in col_norm for alias in alias_list):
|
||||
matched[level] = col
|
||||
break
|
||||
return matched
|
||||
|
||||
|
||||
def detect_smallest_admin_level(df):
|
||||
"""Mendeteksi level administratif terkecil yang ada di DataFrame"""
|
||||
cols = [c.lower() for c in df.columns]
|
||||
if any('desa' in c or 'kelurahan' in c for c in cols):
|
||||
return 'desa'
|
||||
elif any('kecamatan' in c for c in cols):
|
||||
return 'kecamatan'
|
||||
elif any('kab' in c or 'kota' in c for c in cols):
|
||||
return 'kabupaten'
|
||||
return None
|
||||
|
||||
|
||||
def fuzzy_merge(df, master, left_key, right_key, threshold=85):
|
||||
"""Melakukan fuzzy matching antar nama wilayah"""
|
||||
matches = df[left_key].apply(
|
||||
lambda x: process.extractOne(str(x), master[right_key], score_cutoff=threshold)
|
||||
)
|
||||
df['match'] = matches.apply(lambda m: m[0] if m else None)
|
||||
merged = df.merge(master, left_on='match', right_on=right_key, how='left')
|
||||
return merged
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def normalize_name(name: str, level: str = None):
|
||||
if not isinstance(name, str):
|
||||
return None
|
||||
|
||||
name = name.strip()
|
||||
if not name:
|
||||
return None
|
||||
|
||||
name = re.sub(r'\s*\([^)]*\)\s*', '', name)
|
||||
|
||||
raw = name.lower()
|
||||
raw = re.sub(r'^(desa|kelurahan|kel|dusun|kampung)\s+', '', raw)
|
||||
raw = re.sub(r'^(kecamatan|kec)\s+', '', raw)
|
||||
raw = re.sub(r'^(kabupaten|kab\.?|kab)\s+', '', raw)
|
||||
|
||||
if level in ["kabupaten", "kota"]:
|
||||
raw = re.sub(r'^(kota\s+)', '', raw)
|
||||
|
||||
raw = re.sub(r'[^a-z\s]', '', raw)
|
||||
raw = re.sub(r'\s+', ' ', raw).strip()
|
||||
|
||||
tokens = raw.split()
|
||||
|
||||
merged_tokens = []
|
||||
i = 0
|
||||
while i < len(tokens):
|
||||
if i < len(tokens) - 1:
|
||||
sim = fuzz.ratio(tokens[i], tokens[i + 1])
|
||||
if sim > 75:
|
||||
merged_tokens.append(tokens[i] + tokens[i + 1])
|
||||
i += 2
|
||||
continue
|
||||
merged_tokens.append(tokens[i])
|
||||
i += 1
|
||||
|
||||
cleaned_tokens = []
|
||||
prev = None
|
||||
for tok in merged_tokens:
|
||||
if prev and fuzz.ratio(prev, tok) > 95:
|
||||
continue
|
||||
cleaned_tokens.append(tok)
|
||||
prev = tok
|
||||
|
||||
raw = " ".join(cleaned_tokens)
|
||||
formatted = raw.title()
|
||||
|
||||
if level in ["kabupaten", "kota"]:
|
||||
if "kota" in name.lower():
|
||||
if not formatted.startswith("Kota "):
|
||||
formatted = f"Kota {formatted}"
|
||||
else:
|
||||
formatted = formatted.replace("Kota ", "")
|
||||
|
||||
return formatted
|
||||
|
||||
|
||||
|
||||
|
||||
def is_geom_empty(g):
|
||||
"""True jika geometry None, NaN, atau geometry Shapely kosong."""
|
||||
if g is None:
|
||||
return True
|
||||
if isinstance(g, float) and pd.isna(g):
|
||||
return True
|
||||
if isinstance(g, BaseGeometry):
|
||||
return g.is_empty
|
||||
return False
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
import math
|
||||
|
||||
def normalize_lon(val, is_lat=False):
|
||||
if pd.isna(val):
|
||||
return None
|
||||
try:
|
||||
v = float(val)
|
||||
except:
|
||||
return None
|
||||
|
||||
av = abs(v)
|
||||
if av == 0:
|
||||
return v
|
||||
|
||||
if (-180 <= v <= 180 and not is_lat) or (-90 <= v <= 90 and is_lat):
|
||||
return v
|
||||
|
||||
for factor in [1, 10, 100, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9]:
|
||||
nv = v / factor
|
||||
if (not is_lat and -180 <= nv <= 180) or (is_lat and -90 <= nv <= 90):
|
||||
return nv
|
||||
|
||||
return None
|
||||
|
||||
|
||||
|
||||
def normalize_lat(val):
|
||||
if pd.isna(val):
|
||||
return None
|
||||
v = float(val)
|
||||
av = abs(v)
|
||||
if av > 1e9: # contoh: -8167413802 (10 digit)
|
||||
return v / 1e9
|
||||
elif av > 1e8: # fallback jika ada variasi
|
||||
return v / 1e8
|
||||
else:
|
||||
return v
|
||||
|
||||
|
||||
# ============================================================
|
||||
# FUNGSI UTAMA GEOMETRY DETECTION (LAT/LON / PATH)
|
||||
# ============================================================
|
||||
def detect_and_build_geometry(df: pd.DataFrame, master_polygons: gpd.GeoDataFrame = None):
|
||||
"""
|
||||
Mendeteksi dan membentuk geometry dari DataFrame.
|
||||
Bisa dari lat/lon, WKT, atau join ke master polygon (jika disediakan).
|
||||
"""
|
||||
|
||||
if isinstance(df, gpd.GeoDataFrame):
|
||||
geom_cols = [
|
||||
c for c in df.columns
|
||||
if re.match(r'^(geometry|geom|the_geom|wkb_geometry)$', c, re.IGNORECASE)
|
||||
or c.lower().startswith("geom")
|
||||
or c.lower().endswith("geometry")
|
||||
]
|
||||
# if "geometry" in df.columns and df.geometry.notna().any():
|
||||
if geom_cols:
|
||||
geom_count = df.geometry.notna().sum()
|
||||
geom_type = list(df.geom_type.unique())
|
||||
print(f"[INFO] Detected existing geometry in GeoDataFrame ({geom_count} features, {geom_type}).")
|
||||
return df
|
||||
|
||||
lat_col = next((c for c in df.columns if re.search(r'\b(lat|latitude|y[_\s]*coord|y$)\b', c.lower())), None)
|
||||
lon_col = next((c for c in df.columns if re.search(r'\b(lon|long|longitude|x[_\s]*coord|x$)\b', c.lower())), None)
|
||||
|
||||
if lat_col and lon_col:
|
||||
df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce')
|
||||
df[lon_col] = pd.to_numeric(df[lon_col], errors='coerce')
|
||||
|
||||
df[lon_col] = df[lon_col].apply(lambda x: normalize_lon(x, is_lat=False))
|
||||
df[lat_col] = df[lat_col].apply(normalize_lat)
|
||||
|
||||
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df[lon_col], df[lat_col]), crs="EPSG:4326")
|
||||
print("[INFO] Geometry dibangun dari kolom lat/lon.")
|
||||
return gdf
|
||||
|
||||
coord_col = next(
|
||||
(c for c in df.columns if re.search(r'(geom|geometry|wkt|shp|shape|path|coord)', c.lower())), None
|
||||
)
|
||||
|
||||
if coord_col and df[coord_col].notnull().any():
|
||||
sample_val = str(df[coord_col].dropna().iloc[0]).strip()
|
||||
|
||||
if sample_val.startswith('['):
|
||||
def parse_geom(val):
|
||||
try:
|
||||
pts = eval(val)
|
||||
return LineString(pts)
|
||||
except Exception:
|
||||
return None
|
||||
df['geometry'] = df[coord_col].apply(parse_geom)
|
||||
gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")
|
||||
print("[INFO] Geometry dibangun dari kolom koordinat/path (list of points).")
|
||||
return gdf
|
||||
|
||||
elif any(x in sample_val.upper() for x in ["POINT", "LINESTRING", "POLYGON"]):
|
||||
try:
|
||||
df['geometry'] = df[coord_col].apply(
|
||||
lambda g: wkt.loads(g) if isinstance(g, str) and any(
|
||||
x in g.upper() for x in ["POINT", "LINESTRING", "POLYGON"]
|
||||
) else None
|
||||
)
|
||||
gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")
|
||||
print("[INFO] Geometry dibangun dari kolom WKT (Point/Line/Polygon/MultiPolygon).")
|
||||
return gdf
|
||||
except Exception as e:
|
||||
print(f"[WARN] Gagal parsing kolom geometry sebagai WKT: {e}")
|
||||
|
||||
|
||||
|
||||
if master_polygons is not None:
|
||||
df.columns = df.columns.str.lower().str.strip().str.replace(' ', '_').str.replace('/', '_')
|
||||
matches = find_admin_column(df, COLUMN_ALIASES)
|
||||
|
||||
if 'desa' in matches:
|
||||
admin_col = matches['desa']
|
||||
merged = df.merge(master_polygons, left_on=admin_col, right_on='nama_desa', how='left')
|
||||
if merged['geometry'].isna().sum() > 0:
|
||||
merged = fuzzy_merge(df, master_polygons, admin_col, 'nama_desa')
|
||||
gdf = gpd.GeoDataFrame(merged, geometry='geometry', crs=master_polygons.crs)
|
||||
return gdf
|
||||
|
||||
elif 'kecamatan' in matches:
|
||||
admin_col = matches['kecamatan']
|
||||
merged = df.merge(master_polygons, left_on=admin_col, right_on='nama_kecamatan', how='left')
|
||||
gdf = gpd.GeoDataFrame(merged, geometry='geometry', crs=master_polygons.crs)
|
||||
return gdf
|
||||
|
||||
elif 'kabupaten' in matches:
|
||||
admin_col = matches['kabupaten']
|
||||
merged = df.merge(master_polygons, left_on=admin_col, right_on='nama_kabupaten', how='left')
|
||||
gdf = gpd.GeoDataFrame(merged, geometry='geometry', crs=master_polygons.crs)
|
||||
return gdf
|
||||
|
||||
print("[WARN] Tidak ditemukan geometry (lat/lon, path, atau master).")
|
||||
return df
|
||||
|
||||
|
||||
# def get_reference_polygons(level):
|
||||
# """Mengambil data batas wilayah (MultiPolygon) dari DB referensi"""
|
||||
# table_map = {
|
||||
# 'desa': f"{REFERENCE_SCHEMA}.administrasi_ar_keldesa_jatim",
|
||||
# 'kecamatan': f"{REFERENCE_SCHEMA}.administrasi_ar_kec_jatim",
|
||||
# 'kabupaten': f"{REFERENCE_SCHEMA}.administrasi_ar_kabkot_jatim"
|
||||
# }
|
||||
|
||||
# table_name = table_map.get(level)
|
||||
# if not table_name:
|
||||
# raise ValueError(f"Tidak ada tabel referensi untuk level '{level}'.")
|
||||
|
||||
# engine = create_engine(REFERENCE_DB_URL)
|
||||
# query = f"SELECT *, ST_Multi(geom) AS geometry FROM {table_name}"
|
||||
# gdf = gpd.read_postgis(query, engine, geom_col='geometry')
|
||||
|
||||
# print(f"[INFO] {len(gdf)} data referensi '{level}' berhasil dimuat dari {table_name}.")
|
||||
# return gdf
|
||||
|
||||
|
||||
from functools import lru_cache
|
||||
|
||||
@lru_cache(maxsize=3)
|
||||
def get_reference_polygons(level):
|
||||
local_path = f"cache/{level}_ref.parquet"
|
||||
if os.path.exists(local_path):
|
||||
print(f"[CACHE] Memuat referensi '{level}' dari file lokal.")
|
||||
return gpd.read_parquet(local_path)
|
||||
|
||||
print(f"[DB] Mengambil data referensi '{level}' dari database...")
|
||||
table_map = {
|
||||
"desa": f"{REFERENCE_SCHEMA}.administrasi_ar_keldesa_jatim",
|
||||
"kecamatan": f"{REFERENCE_SCHEMA}.administrasi_ar_kec_jatim",
|
||||
"kabupaten": f"{REFERENCE_SCHEMA}.administrasi_ar_kabkot_jatim"
|
||||
}
|
||||
table_name = table_map.get(level)
|
||||
engine = create_engine(REFERENCE_DB_URL)
|
||||
query = f"SELECT *, ST_Multi(geom) AS geometry FROM {table_name}"
|
||||
gdf = gpd.read_postgis(query, engine, geom_col="geometry")
|
||||
gdf.to_parquet(local_path)
|
||||
print(f"[CACHE] Disimpan ke {local_path}")
|
||||
return gdf
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Optimize Join
|
||||
# ============================================================
|
||||
def build_join_key(df, cols):
|
||||
arr = df[cols].astype(str).replace("nan", "", regex=False).to_numpy()
|
||||
return np.char.add.reduce(np.column_stack(
|
||||
[arr[:, i] + ("|" if i < len(cols) - 1 else "") for i in range(len(cols))]
|
||||
), axis=1)
|
||||
|
||||
|
||||
# ============================================================
|
||||
# FUNGSI: AUTO ATTACH POLYGON KE DATAFRAME NON-SPASIAL
|
||||
# ============================================================
|
||||
def attach_polygon_geometry_auto(df: pd.DataFrame):
|
||||
"""
|
||||
Tambahkan kolom geometry MultiPolygon berdasarkan kombinasi
|
||||
(desa/kelurahan + kecamatan + kabupaten/kota), tanpa duplikasi baris.
|
||||
"""
|
||||
level = detect_smallest_admin_level(df)
|
||||
if not level:
|
||||
print("[WARN] Tidak ditemukan kolom administratif (desa/kecamatan/kabupaten).")
|
||||
return df
|
||||
|
||||
print(f"[INFO] Detected smallest admin level: {level}")
|
||||
ref_gdf = get_reference_polygons(level)
|
||||
|
||||
desa_col = next((c for c in df.columns if any(x in c.lower() for x in ['desa', 'kelurahan'])), None)
|
||||
kec_col = next((c for c in df.columns if 'kec' in c.lower()), None)
|
||||
kab_col = next((c for c in df.columns if any(x in c.lower() for x in ['kab', 'kota'])), None)
|
||||
|
||||
if desa_col and (not kec_col or not kab_col):
|
||||
print("[ERROR] Kolom 'Desa' ditemukan tetapi kolom 'Kecamatan' dan/atau 'Kabupaten' tidak lengkap.")
|
||||
print(f"[DEBUG] Ditemukan: Desa={desa_col}, Kec={kec_col}, Kab={kab_col}")
|
||||
return df
|
||||
|
||||
elif not desa_col and kec_col and not kab_col:
|
||||
print("[ERROR] Kolom 'Kecamatan' ditemukan tetapi kolom 'Kabupaten/Kota' tidak ditemukan.")
|
||||
print(f"[DEBUG] Ditemukan: Desa={desa_col}, Kec={kec_col}, Kab={kab_col}")
|
||||
return df
|
||||
|
||||
elif kab_col and not desa_col and not kec_col :
|
||||
print("[INFO] Struktur kolom administratif valid (minimal Kabupaten/Kota ditemukan).")
|
||||
print(f"[DEBUG] Ditemukan: Desa={desa_col}, Kec={kec_col}, Kab={kab_col}")
|
||||
|
||||
elif not desa_col and not kec_col and not kab_col:
|
||||
print("[WARN] Tidak ditemukan kolom administratif apapun (Desa/Kecamatan/Kabupaten).")
|
||||
print(f"[DEBUG] Kolom CSV: {list(df.columns)}")
|
||||
return df
|
||||
|
||||
# kolom di referensi
|
||||
desa_ref = DESA_REF
|
||||
kec_ref = KEC_REF
|
||||
kab_ref = KAB_REF
|
||||
|
||||
if desa_col is not None:
|
||||
df[desa_col] = df[desa_col].astype(str).apply(lambda x: normalize_name(x, "desa"))
|
||||
|
||||
if kec_col is not None:
|
||||
df[kec_col] = df[kec_col].astype(str).apply(lambda x: normalize_name(x, "kecamatan"))
|
||||
|
||||
if kab_col is not None:
|
||||
df[kab_col] = df[kab_col].astype(str).apply(lambda x: normalize_name(x, "kabupaten"))
|
||||
|
||||
|
||||
if desa_ref is not None:
|
||||
ref_gdf[desa_ref] = ref_gdf[desa_ref].astype(str).apply(lambda x: normalize_name(x, "desa"))
|
||||
|
||||
if kec_ref is not None:
|
||||
ref_gdf[kec_ref] = ref_gdf[kec_ref].astype(str).apply(lambda x: normalize_name(x, "kecamatan"))
|
||||
|
||||
if kab_ref is not None:
|
||||
ref_gdf[kab_ref] = ref_gdf[kab_ref].astype(str).apply(lambda x: normalize_name(x, "kabupaten"))
|
||||
|
||||
|
||||
|
||||
|
||||
join_cols = [col for col in [desa_col, kec_col, kab_col] if col]
|
||||
|
||||
if not join_cols:
|
||||
print("[ERROR] Tidak ada kolom administratif yang bisa digunakan untuk join key.")
|
||||
else:
|
||||
join_cols_df = [col for col in [desa_col, kec_col, kab_col] if col]
|
||||
join_cols_ref = [col for col in [desa_ref, kec_ref, kab_ref] if col]
|
||||
|
||||
common_depth = min(len(join_cols_df), len(join_cols_ref))
|
||||
join_cols_df = join_cols_df[-common_depth:]
|
||||
join_cols_ref = join_cols_ref[-common_depth:]
|
||||
|
||||
# print(f"[DEBUG] Join kolom DF : {join_cols_df}")
|
||||
# print(f"[DEBUG] Join kolom REF : {join_cols_ref}")
|
||||
|
||||
# df["_join_key"] = df[join_cols_df].astype(str).agg("|".join, axis=1)
|
||||
# ref_gdf["_join_key"] = ref_gdf[join_cols_ref].astype(str).agg("|".join, axis=1)
|
||||
|
||||
df["_join_key"] = build_join_key(df, join_cols_df)
|
||||
ref_gdf["_join_key"] = build_join_key(ref_gdf, join_cols_ref)
|
||||
|
||||
|
||||
# print(f"[INFO] Join key berhasil dibuat dari kolom: {join_cols_df}")
|
||||
|
||||
ref_lookup = ref_gdf[["_join_key", "geometry"]].drop_duplicates(subset=["_join_key"])
|
||||
df = df.merge(ref_lookup, how="left", on="_join_key")
|
||||
matched = df["geometry"].notna().sum()
|
||||
# print(f"[INFO] {matched} dari {len(df)} baris cocok langsung berdasarkan (desa + kec + kab/kota).")
|
||||
|
||||
if matched < len(df):
|
||||
unmatched = df[df["geometry"].isna()]
|
||||
# print(f"[INFO] Melakukan fuzzy match untuk {len(unmatched)} baris yang belum cocok...")
|
||||
|
||||
ref_dict = dict(zip(ref_lookup["_join_key"], ref_lookup["geometry"]))
|
||||
|
||||
def find_fuzzy_geom(row):
|
||||
key = row["_join_key"]
|
||||
if not isinstance(key, str):
|
||||
return None
|
||||
# fuzzy old
|
||||
# match = process.extractOne(key, list(ref_dict.keys()), scorer=fuzz.token_sort_ratio)
|
||||
# fuzzy new
|
||||
match = process.extractOne(
|
||||
key, list(ref_dict.keys()), scorer=fuzz.token_set_ratio, score_cutoff=80
|
||||
)
|
||||
|
||||
if match and match[1] >= 85:
|
||||
return ref_dict[match[0]]
|
||||
return None
|
||||
|
||||
df.loc[df["geometry"].isna(), "geometry"] = df[df["geometry"].isna()].apply(find_fuzzy_geom, axis=1)
|
||||
|
||||
df = df.drop(columns=["_join_key"], errors="ignore")
|
||||
|
||||
# admin_cols = [col for col in [desa_col, kec_col, kab_col] if col and col in df.columns]
|
||||
# if matched < len(df):
|
||||
# diff = df[df['geometry'].isna()][admin_cols]
|
||||
|
||||
# print("[DEBUG] Baris yang tidak match:")
|
||||
# if diff.empty:
|
||||
# print("(semua baris berhasil match)")
|
||||
# else:
|
||||
# print(diff.to_string(index=False))
|
||||
|
||||
|
||||
# print(f"[REPORT] Total match: {df['geometry'].notna().sum()} / {len(df)} ({df['geometry'].notna().mean()*100:.2f}%)")
|
||||
|
||||
|
||||
return gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")
|
||||
693
app/mapset_pipeline/core/publication/publish_geonetwork.py
Executable file
693
app/mapset_pipeline/core/publication/publish_geonetwork.py
Executable file
|
|
@ -0,0 +1,693 @@
|
|||
from fastapi import HTTPException
|
||||
import requests
|
||||
from sqlalchemy import text
|
||||
from app.core.config import GEONETWORK_PASS, GEONETWORK_URL, GEONETWORK_USER
|
||||
from database.connection import engine
|
||||
from datetime import datetime
|
||||
from uuid import uuid4
|
||||
import re
|
||||
|
||||
|
||||
|
||||
def create_gn_session():
    """Open an authenticated GeoNetwork session and fetch its XSRF token.

    Returns:
        tuple: ``(session, xsrf_token)`` where ``session`` is a
        :class:`requests.Session` carrying HTTP basic auth and the cookies
        GeoNetwork issued.

    Raises:
        Exception: if GeoNetwork did not set an ``XSRF-TOKEN`` cookie.
    """
    session = requests.Session()
    session.auth = (GEONETWORK_USER, GEONETWORK_PASS)

    # Any authenticated GET makes GeoNetwork issue the XSRF-TOKEN cookie.
    # A timeout is set so a dead server cannot hang the pipeline forever
    # (the original call had no timeout).
    session.get(f"{GEONETWORK_URL}/srv/eng/info?type=me", timeout=30)
    xsrf_token = session.cookies.get("XSRF-TOKEN")

    if not xsrf_token:
        raise Exception("XSRF token missing")

    return session, xsrf_token
|
||||
|
||||
|
||||
|
||||
def escape_url_params(url: str) -> str:
    """Escape ampersands in *url* so the URL is valid inside XML.

    Every ``&`` that is not already part of an ``&amp;`` entity is replaced
    by ``&amp;`` (the negative lookahead keeps existing entities intact).
    """
    return re.sub(r"&(?!amp;)", "&amp;", url)
|
||||
|
||||
|
||||
def fix_xml_urls(xml: str) -> str:
    """Escape the URL inside every ``<gmd:URL>...</gmd:URL>`` element.

    Each matched URL is stripped of surrounding whitespace and its raw
    ampersands are converted to ``&amp;`` so the document stays valid XML.
    """
    def _escape(match):
        # Same escaping as escape_url_params, applied to the trimmed URL.
        url = re.sub(r"&(?!amp;)", "&amp;", match.group(1).strip())
        return f"<gmd:URL>{url}</gmd:URL>"

    # DOTALL lets a URL span line breaks inside the element.
    return re.sub(r"<gmd:URL>(.*?)</gmd:URL>", _escape, xml, flags=re.DOTALL)
|
||||
|
||||
|
||||
|
||||
async def get_extent(table_name: str):
    """Return a WGS84 bounding box for the geometries of ``public."table_name"``.

    Returns ``None`` when the table has no geometries (or does not produce an
    extent); otherwise a dict with ``xmin``/``ymin``/``xmax``/``ymax``.

    NOTE(review): the computed extent is currently discarded and a hard-coded
    East-Java bounding box is returned instead (the real values are in the
    commented-out return below) — confirm this override is still wanted.
    """
    # table_name is interpolated into the SQL as an identifier; identifiers
    # cannot be bound as parameters, so callers must pass trusted names only.
    sql = f"""
        SELECT
            ST_XMin(extent), ST_YMin(extent),
            ST_XMax(extent), ST_YMax(extent)
        FROM (
            SELECT ST_Extent(geom) AS extent
            FROM public."{table_name}"
        ) AS box;
    """

    async with engine.connect() as conn:
        # SQLAlchemy 2.x refuses plain strings: textual SQL must be wrapped
        # in text() (the original passed the raw string and would raise).
        result = await conn.execute(text(sql))
        row = result.fetchone()

    if not row or row[0] is None:
        return None

    # return {
    #     "xmin": row[0],
    #     "ymin": row[1],
    #     "xmax": row[2],
    #     "ymax": row[3]
    # }

    # Hard-coded fallback covering East Java; see NOTE in the docstring.
    return {
        "xmin": 110.1372,  # west
        "ymin": -9.3029,   # south
        "xmax": 114.5287,  # east
        "ymax": -5.4819    # north
    }
|
||||
|
||||
async def get_author_metadata(table_name: str):
    """Fetch the author/organization metadata row for *table_name*.

    Joins ``backend.author_metadata`` with the owning user's organization and
    returns the first matching row as a plain dict.

    Raises:
        Exception: when no metadata row exists for the table.
    """
    # text() is required by SQLAlchemy 2.x for textual SQL; :table is a bound
    # parameter, so the lookup value is safely escaped.
    sql = text("""
        SELECT am.table_title, am.dataset_title, am.dataset_abstract, am.keywords, am.date_created,
               am.organization_name, am.contact_person_name, am.created_at,
               am.contact_email, am.contact_phone, am.geom_type,
               u.organization_id,
               o.address AS organization_address,
               o.email AS organization_email,
               o.phone_number AS organization_phone
        FROM backend.author_metadata AS am
        LEFT JOIN backend.users u ON am.user_id = u.id
        LEFT JOIN backend.organizations o ON u.organization_id = o.id
        WHERE am.table_title = :table
        LIMIT 1
    """)

    async with engine.connect() as conn:
        result = await conn.execute(sql, {"table": table_name})
        row = result.fetchone()

    if not row:
        raise Exception(f"Tidak ada metadata untuk tabel: {table_name}")

    # SQLAlchemy async rows expose ._mapping for dict conversion.
    return dict(row._mapping)
|
||||
|
||||
|
||||
def map_geom_type(gtype):
    """Map a geometry-type name to an ISO 19139 geometric object code.

    Accepts a string (e.g. ``"MultiPolygon"``), a list of such strings
    (first element wins), or ``None``. Returns one of ``"surface"``,
    ``"curve"`` or ``"point"``; ``"surface"`` is the fallback.
    """
    if gtype is None:
        return "surface"

    # A list of geometry types: take the first entry, or fall back.
    if isinstance(gtype, list):
        if not gtype:
            return "surface"
        gtype = gtype[0]

    gtype = str(gtype).lower()

    # Check line/point BEFORE the generic "multi"/"polygon" test: the
    # original ordering (`"polygon" in g or "multi" in g` first) wrongly
    # mapped MultiLineString and MultiPoint to "surface".
    if "line" in gtype:
        return "curve"
    if "point" in gtype:
        return "point"
    if "polygon" in gtype or "multi" in gtype:
        return "surface"

    return "surface"
|
||||
|
||||
|
||||
def generate_metadata_xml(table_name, meta, extent, geoserver_links):
    """Render an ISO 19115/19139 metadata record as an XML string.

    Args:
        table_name: layer/table name. NOTE(review): this argument is not
            referenced inside the template (callers currently pass the
            dataset title here) — confirm whether it should be used.
        meta: author/organization metadata dict as returned by
            ``get_author_metadata`` (dataset_title, keywords, contacts, ...).
        extent: dict with ``xmin``/``ymin``/``xmax``/``ymax`` bounding-box
            values in EPSG:4326.
        geoserver_links: dict with at least ``wms_url`` and ``wfs_url``.

    Returns:
        The complete metadata document as an XML-formatted string with a
        freshly generated fileIdentifier UUID.

    NOTE(review): the dateStamp is built from datetime.utcnow() but labeled
    "+07:00", and geometricObjectCount is hard-coded to 38 — confirm both
    are intended.
    """
    # One <gmd:keyword> element per comma-separated keyword in meta["keywords"].
    keywords_xml = "".join([
        f"""
        <gmd:keyword><gco:CharacterString>{kw.strip()}</gco:CharacterString></gmd:keyword>
        """ for kw in meta["keywords"].split(",")
    ])

    # ISO geometric-object code ("surface"/"curve"/"point") for this layer.
    geom_type_code = map_geom_type(meta["geom_type"])
    print('type', geom_type_code)
    # New record identifier; GeoNetwork adopts this as the metadata UUID.
    uuid = str(uuid4())

    return f"""
    <gmd:MD_Metadata
        xmlns:gmd="http://www.isotc211.org/2005/gmd"
        xmlns:gco="http://www.isotc211.org/2005/gco"
        xmlns:srv="http://www.isotc211.org/2005/srv"
        xmlns:gmx="http://www.isotc211.org/2005/gmx"
        xmlns:gts="http://www.isotc211.org/2005/gts"
        xmlns:gsr="http://www.isotc211.org/2005/gsr"
        xmlns:gmi="http://www.isotc211.org/2005/gmi"
        xmlns:gml="http://www.opengis.net/gml/3.2"
        xmlns:xlink="http://www.w3.org/1999/xlink"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.isotc211.org/2005/gmd http://schemas.opengis.net/csw/2.0.2/profiles/apiso/1.0.0/apiso.xsd">
      <gmd:fileIdentifier>
        <gco:CharacterString>{uuid}</gco:CharacterString>
      </gmd:fileIdentifier>
      <gmd:language>
        <gmd:LanguageCode codeList="http://www.loc.gov/standards/iso639-2/" codeListValue="eng"/>
      </gmd:language>
      <gmd:characterSet>
        <gmd:MD_CharacterSetCode codeListValue="utf8" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_CharacterSetCode"/>
      </gmd:characterSet>
      <gmd:hierarchyLevel>
        <gmd:MD_ScopeCode codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_ScopeCode" codeListValue="feature"/>
      </gmd:hierarchyLevel>
      <gmd:contact>
        <gmd:CI_ResponsibleParty>
          <gmd:individualName>
            <gco:CharacterString>{meta['contact_person_name']}</gco:CharacterString>
          </gmd:individualName>
          <gmd:organisationName>
            <gco:CharacterString>{meta['organization_name']}</gco:CharacterString>
          </gmd:organisationName>
          <gmd:contactInfo>
            <gmd:CI_Contact>
              <gmd:phone>
                <gmd:CI_Telephone>
                  <gmd:voice>
                    <gco:CharacterString>{meta['organization_phone']}</gco:CharacterString>
                  </gmd:voice>
                  <gmd:facsimile>
                    <gco:CharacterString>{meta['organization_phone']}</gco:CharacterString>
                  </gmd:facsimile>
                </gmd:CI_Telephone>
              </gmd:phone>
              <gmd:address>
                <gmd:CI_Address>
                  <gmd:deliveryPoint>
                    <gco:CharacterString>{meta['organization_address']}</gco:CharacterString>
                  </gmd:deliveryPoint>
                  <gmd:city>
                    <gco:CharacterString>Surabaya</gco:CharacterString>
                  </gmd:city>
                  <gmd:administrativeArea>
                    <gco:CharacterString>Jawa Timur</gco:CharacterString>
                  </gmd:administrativeArea>
                  <gmd:country>
                    <gco:CharacterString>Indonesia</gco:CharacterString>
                  </gmd:country>
                  <gmd:electronicMailAddress>
                    <gco:CharacterString>{meta['organization_email']}</gco:CharacterString>
                  </gmd:electronicMailAddress>
                </gmd:CI_Address>
              </gmd:address>
              <gmd:hoursOfService>
                <gco:CharacterString>08.00-16.00</gco:CharacterString>
              </gmd:hoursOfService>
            </gmd:CI_Contact>
          </gmd:contactInfo>
          <gmd:role>
            <gmd:CI_RoleCode codeListValue="pointOfContact" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#CI_RoleCode"/>
          </gmd:role>
        </gmd:CI_ResponsibleParty>
      </gmd:contact>
      <gmd:dateStamp>
        <gco:DateTime>{datetime.utcnow().isoformat()}+07:00</gco:DateTime>
      </gmd:dateStamp>
      <gmd:metadataStandardName>
        <gco:CharacterString>ISO 19115:2003/19139</gco:CharacterString>
      </gmd:metadataStandardName>
      <gmd:metadataStandardVersion>
        <gco:CharacterString>1.0</gco:CharacterString>
      </gmd:metadataStandardVersion>
      <gmd:spatialRepresentationInfo>
        <gmd:MD_VectorSpatialRepresentation>
          <gmd:geometricObjects>
            <gmd:MD_GeometricObjects>
              <gmd:geometricObjectType>
                <gmd:MD_GeometricObjectTypeCode codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_GeometricObjectTypeCode" codeListValue="{geom_type_code}"/>
              </gmd:geometricObjectType>
              <gmd:geometricObjectCount>
                <gco:Integer>38</gco:Integer>
              </gmd:geometricObjectCount>
            </gmd:MD_GeometricObjects>
          </gmd:geometricObjects>
        </gmd:MD_VectorSpatialRepresentation>
      </gmd:spatialRepresentationInfo>
      <gmd:referenceSystemInfo>
        <gmd:MD_ReferenceSystem>
          <gmd:referenceSystemIdentifier>
            <gmd:RS_Identifier>
              <gmd:code>
                <gco:CharacterString>4326</gco:CharacterString>
              </gmd:code>
              <gmd:codeSpace>
                <gco:CharacterString>EPSG</gco:CharacterString>
              </gmd:codeSpace>
            </gmd:RS_Identifier>
          </gmd:referenceSystemIdentifier>
        </gmd:MD_ReferenceSystem>
      </gmd:referenceSystemInfo>
      <gmd:identificationInfo>
        <gmd:MD_DataIdentification>
          <gmd:citation>
            <gmd:CI_Citation>
              <gmd:title>
                <gco:CharacterString>{meta['dataset_title']}</gco:CharacterString>
              </gmd:title>
              <gmd:date>
                <gmd:CI_Date>
                  <gmd:date>
                    <gco:DateTime>{meta['created_at'].isoformat()}+07:00</gco:DateTime>
                  </gmd:date>
                  <gmd:dateType>
                    <gmd:CI_DateTypeCode codeListValue="publication" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#CI_DateTypeCode"/>
                  </gmd:dateType>
                </gmd:CI_Date>
              </gmd:date>
              <gmd:edition>
                <gco:CharacterString>{meta['date_created'].year}</gco:CharacterString>
              </gmd:edition>
              <gmd:citedResponsibleParty>
                <gmd:CI_ResponsibleParty>
                  <gmd:individualName>
                    <gco:CharacterString>{meta['contact_person_name']}</gco:CharacterString>
                  </gmd:individualName>
                  <gmd:organisationName>
                    <gco:CharacterString>{meta['organization_name']}</gco:CharacterString>
                  </gmd:organisationName>
                  <gmd:contactInfo>
                    <gmd:CI_Contact>
                      <gmd:phone>
                        <gmd:CI_Telephone>
                          <gmd:voice>
                            <gco:CharacterString>{meta['organization_phone']}</gco:CharacterString>
                          </gmd:voice>
                          <gmd:facsimile>
                            <gco:CharacterString>{meta['organization_phone']}</gco:CharacterString>
                          </gmd:facsimile>
                        </gmd:CI_Telephone>
                      </gmd:phone>
                      <gmd:address>
                        <gmd:CI_Address>
                          <gmd:deliveryPoint>
                            <gco:CharacterString>{meta['organization_address']}</gco:CharacterString>
                          </gmd:deliveryPoint>
                          <gmd:city>
                            <gco:CharacterString>Surabaya</gco:CharacterString>
                          </gmd:city>
                          <gmd:country>
                            <gco:CharacterString>Indonesia</gco:CharacterString>
                          </gmd:country>
                          <gmd:electronicMailAddress>
                            <gco:CharacterString>{meta['organization_email']}</gco:CharacterString>
                          </gmd:electronicMailAddress>
                        </gmd:CI_Address>
                      </gmd:address>
                      <gmd:hoursOfService>
                        <gco:CharacterString>08.00-16.00</gco:CharacterString>
                      </gmd:hoursOfService>
                    </gmd:CI_Contact>
                  </gmd:contactInfo>
                  <gmd:role>
                    <gmd:CI_RoleCode codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#CI_RoleCode" codeListValue="custodian"/>
                  </gmd:role>
                </gmd:CI_ResponsibleParty>
              </gmd:citedResponsibleParty>
              <gmd:otherCitationDetails>
                <gco:CharacterString>Timezone: UTC+7 (Asia/Jakarta)</gco:CharacterString>
              </gmd:otherCitationDetails>
            </gmd:CI_Citation>
          </gmd:citation>
          <gmd:abstract>
            <gco:CharacterString>{meta['dataset_abstract']}</gco:CharacterString>
          </gmd:abstract>
          <gmd:purpose>
            <gco:CharacterString>{meta['dataset_abstract']}</gco:CharacterString>
          </gmd:purpose>
          <gmd:status>
            <gmd:MD_ProgressCode codeListValue="completed" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_ProgressCode"/>
          </gmd:status>
          <gmd:pointOfContact>
            <gmd:CI_ResponsibleParty>
              <gmd:individualName>
                <gco:CharacterString>Lab AI Polinema</gco:CharacterString>
              </gmd:individualName>
              <gmd:organisationName>
                <gco:CharacterString>Lab AI Polinema</gco:CharacterString>
              </gmd:organisationName>
              <gmd:positionName gco:nilReason="missing"/>
              <gmd:contactInfo>
                <gmd:CI_Contact>
                  <gmd:phone>
                    <gmd:CI_Telephone>
                      <gmd:voice>
                        <gco:CharacterString>{meta['organization_phone']}</gco:CharacterString>
                      </gmd:voice>
                      <gmd:facsimile>
                        <gco:CharacterString>{meta['organization_phone']}</gco:CharacterString>
                      </gmd:facsimile>
                    </gmd:CI_Telephone>
                  </gmd:phone>
                  <gmd:address>
                    <gmd:CI_Address>
                      <gmd:deliveryPoint>
                        <gco:CharacterString>{meta['organization_address']}</gco:CharacterString>
                      </gmd:deliveryPoint>
                      <gmd:city>
                        <gco:CharacterString>Surabaya</gco:CharacterString>
                      </gmd:city>
                      <gmd:administrativeArea>
                        <gco:CharacterString>Jawa Timur</gco:CharacterString>
                      </gmd:administrativeArea>
                      <gmd:country>
                        <gco:CharacterString>Indonesia</gco:CharacterString>
                      </gmd:country>
                      <gmd:electronicMailAddress>
                        <gco:CharacterString>{meta['organization_email']}</gco:CharacterString>
                      </gmd:electronicMailAddress>
                    </gmd:CI_Address>
                  </gmd:address>
                </gmd:CI_Contact>
              </gmd:contactInfo>
              <gmd:role>
                <gmd:CI_RoleCode codeListValue="owner" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#CI_RoleCode"/>
              </gmd:role>
            </gmd:CI_ResponsibleParty>
          </gmd:pointOfContact>
          <gmd:resourceMaintenance>
            <gmd:MD_MaintenanceInformation>
              <gmd:maintenanceAndUpdateFrequency>
                <gmd:MD_MaintenanceFrequencyCode codeListValue="annually" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_MaintenanceFrequencyCode"/>
              </gmd:maintenanceAndUpdateFrequency>
            </gmd:MD_MaintenanceInformation>
          </gmd:resourceMaintenance>
          <gmd:descriptiveKeywords>
            <gmd:MD_Keywords>
              {keywords_xml}
            </gmd:MD_Keywords>
          </gmd:descriptiveKeywords>
          <gmd:resourceConstraints>
            <gmd:MD_LegalConstraints>
              <gmd:accessConstraints>
                <gmd:MD_RestrictionCode codeListValue="copyright" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_RestrictionCode"/>
              </gmd:accessConstraints>
              <gmd:useConstraints>
                <gmd:MD_RestrictionCode codeListValue="otherRestrictions" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_RestrictionCode"/>
              </gmd:useConstraints>
              <gmd:otherConstraints>
                <gco:CharacterString>Penggunaan data harus mencantumkan sumber: {meta['organization_name']}.</gco:CharacterString>
              </gmd:otherConstraints>
            </gmd:MD_LegalConstraints>
          </gmd:resourceConstraints>
          <gmd:spatialRepresentationType>
            <gmd:MD_SpatialRepresentationTypeCode codeListValue="vector" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_SpatialRepresentationTypeCode"/>
          </gmd:spatialRepresentationType>
          <gmd:spatialResolution>
            <gmd:MD_Resolution>
              <gmd:equivalentScale>
                <gmd:MD_RepresentativeFraction>
                  <gmd:denominator>
                    <gco:Integer>25000</gco:Integer>
                  </gmd:denominator>
                </gmd:MD_RepresentativeFraction>
              </gmd:equivalentScale>
            </gmd:MD_Resolution>
          </gmd:spatialResolution>
          <gmd:language>
            <gmd:LanguageCode codeList="http://www.loc.gov/standards/iso639-2/" codeListValue="eng"/>
          </gmd:language>
          <gmd:characterSet>
            <gmd:MD_CharacterSetCode codeListValue="utf8" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_CharacterSetCode"/>
          </gmd:characterSet>
          <gmd:extent>
            <gmd:EX_Extent>
              <gmd:geographicElement>
                <gmd:EX_GeographicBoundingBox>
                  <gmd:westBoundLongitude><gco:Decimal>{extent['xmin']}</gco:Decimal></gmd:westBoundLongitude>
                  <gmd:eastBoundLongitude><gco:Decimal>{extent['xmax']}</gco:Decimal></gmd:eastBoundLongitude>
                  <gmd:southBoundLatitude><gco:Decimal>{extent['ymin']}</gco:Decimal></gmd:southBoundLatitude>
                  <gmd:northBoundLatitude><gco:Decimal>{extent['ymax']}</gco:Decimal></gmd:northBoundLatitude>
                </gmd:EX_GeographicBoundingBox>
              </gmd:geographicElement>
            </gmd:EX_Extent>
          </gmd:extent>
        </gmd:MD_DataIdentification>
      </gmd:identificationInfo>
      <gmd:contentInfo>
        <gmd:MD_FeatureCatalogueDescription>
          <gmd:complianceCode>
            <gco:Boolean>true</gco:Boolean>
          </gmd:complianceCode>
          <gmd:includedWithDataset gco:nilReason="unknown"/>
          <gmd:featureCatalogueCitation>
            <gmd:CI_Citation>
              <gmd:title>
                <gco:CharacterString>{meta['dataset_title']}</gco:CharacterString>
              </gmd:title>
              <gmd:date>
                <gmd:CI_Date>
                  <gmd:date>
                    <gco:DateTime>{meta['created_at'].isoformat()}+07:00</gco:DateTime>
                  </gmd:date>
                  <gmd:dateType>
                    <gmd:CI_DateTypeCode codeListValue="publication" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#CI_DateTypeCode"/>
                  </gmd:dateType>
                </gmd:CI_Date>
              </gmd:date>
              <gmd:edition>
                <gco:CharacterString>{meta['date_created'].year}</gco:CharacterString>
              </gmd:edition>
            </gmd:CI_Citation>
          </gmd:featureCatalogueCitation>
        </gmd:MD_FeatureCatalogueDescription>
      </gmd:contentInfo>
      <gmd:distributionInfo>
        <gmd:MD_Distribution>
          <gmd:transferOptions>
            <gmd:MD_DigitalTransferOptions>
              <gmd:onLine>
                <gmd:CI_OnlineResource>
                  <gmd:linkage>
                    <gmd:URL>{geoserver_links["wms_url"]}</gmd:URL>
                  </gmd:linkage>
                  <gmd:protocol>
                    <gco:CharacterString>DB:POSTGIS</gco:CharacterString>
                  </gmd:protocol>
                  <gmd:name>
                    <gco:CharacterString>{meta["dataset_title"]}</gco:CharacterString>
                  </gmd:name>
                  <gmd:description>
                    <gco:CharacterString>{meta["dataset_title"]}</gco:CharacterString>
                  </gmd:description>
                </gmd:CI_OnlineResource>
              </gmd:onLine>
              <gmd:onLine>
                <gmd:CI_OnlineResource>
                  <gmd:linkage>
                    <gmd:URL>{geoserver_links["wms_url"]}</gmd:URL>
                  </gmd:linkage>
                  <gmd:protocol>
                    <gco:CharacterString>WWW:LINK-1.0-http--link</gco:CharacterString>
                  </gmd:protocol>
                  <gmd:name>
                    <gco:CharacterString>{meta["dataset_title"]}</gco:CharacterString>
                  </gmd:name>
                  <gmd:description>
                    <gco:CharacterString>{meta["dataset_title"]}</gco:CharacterString>
                  </gmd:description>
                </gmd:CI_OnlineResource>
              </gmd:onLine>
              <gmd:onLine>
                <gmd:CI_OnlineResource>
                  <gmd:linkage>
                    <gmd:URL>{geoserver_links["wms_url"]}</gmd:URL>
                  </gmd:linkage>
                  <gmd:protocol>
                    <gco:CharacterString>OGC:WMS</gco:CharacterString>
                  </gmd:protocol>
                  <gmd:name>
                    <gco:CharacterString>{meta["dataset_title"]}</gco:CharacterString>
                  </gmd:name>
                </gmd:CI_OnlineResource>
              </gmd:onLine>
              <gmd:onLine>
                <gmd:CI_OnlineResource>
                  <gmd:linkage>
                    <gmd:URL>{geoserver_links["wfs_url"]}</gmd:URL>
                  </gmd:linkage>
                  <gmd:protocol>
                    <gco:CharacterString>OGC:WFS</gco:CharacterString>
                  </gmd:protocol>
                  <gmd:name>
                    <gco:CharacterString>{meta["dataset_title"]}</gco:CharacterString>
                  </gmd:name>
                </gmd:CI_OnlineResource>
              </gmd:onLine>
            </gmd:MD_DigitalTransferOptions>
          </gmd:transferOptions>
        </gmd:MD_Distribution>
      </gmd:distributionInfo>
      <gmd:dataQualityInfo>
        <gmd:DQ_DataQuality>
          <gmd:scope>
            <gmd:DQ_Scope>
              <gmd:level>
                <gmd:MD_ScopeCode codeListValue="dataset" codeList="http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_ScopeCode"/>
              </gmd:level>
            </gmd:DQ_Scope>
          </gmd:scope>
          <gmd:lineage>
            <gmd:LI_Lineage>
              <gmd:statement>
                <gco:CharacterString>Data dihasilkan dari digitasi peta dasar skala 1:25000 menggunakan QGIS.</gco:CharacterString>
              </gmd:statement>
            </gmd:LI_Lineage>
          </gmd:lineage>
        </gmd:DQ_DataQuality>
      </gmd:dataQualityInfo>
    </gmd:MD_Metadata>
    """
|
||||
|
||||
|
||||
# Geonetwork version 4.4.9.0
def upload_metadata_to_geonetwork(xml_metadata: str):
    """Upload an ISO 19139 record to GeoNetwork and publish it.

    Args:
        xml_metadata: the complete metadata document as an XML string.

    Returns:
        str: the UUID GeoNetwork assigned to the new record.

    Raises:
        requests.HTTPError: when the upload request fails.
        ValueError: when the response contains no record UUID.
    """
    session, xsrf_token = create_gn_session()
    headers = {
        'X-XSRF-TOKEN': xsrf_token,
        'Accept': 'application/json'
    }

    GN_API_RECORDS_URL = f"{GEONETWORK_URL}/srv/api/records"

    # GeoNetwork requires a multipart/form-data upload.
    files = {
        'file': ('metadata.xml', xml_metadata, 'application/xml')
    }

    params = {
        "ownerGroup": 1,  # all
        "ownerUser": 1    # admin
    }

    # The session already carries its cookie jar, so no explicit cookies=
    # argument is needed here.
    response = session.post(
        GN_API_RECORDS_URL,
        params=params,
        files=files,
        headers=headers
    )
    # Fail with a clear HTTP error instead of a confusing JSON decode
    # error when GeoNetwork rejects the upload.
    response.raise_for_status()

    # The new record's UUID is buried inside "metadataInfos"; take the
    # first record of the first non-empty list.
    metadata_infos = response.json().get("metadataInfos", {})
    uuid = None
    for records in metadata_infos.values():
        if records and isinstance(records, list):
            uuid = records[0].get("uuid")
            break
    if not uuid:
        raise ValueError("UUID not found in GeoNetwork response")

    # Immediately grant public view rights so the record is visible.
    publish_record(session, uuid)

    return uuid
|
||||
|
||||
|
||||
|
||||
async def publish_metadata(table_name: str, geoserver_links: dict):
    """Build, clean, and upload GeoNetwork metadata for *table_name*.

    Returns the UUID of the created GeoNetwork record.
    """
    extent = await get_extent(table_name)
    meta = await get_author_metadata(table_name)

    raw_xml = generate_metadata_xml(
        # NOTE: the dataset title is passed as the name, not the raw table.
        table_name=meta["dataset_title"],
        meta=meta,
        extent=extent,
        geoserver_links=geoserver_links,
    )

    # Escape raw ampersands inside <gmd:URL> elements before upload.
    uuid = upload_metadata_to_geonetwork(fix_xml_urls(raw_xml))

    print(f"[GeoNetwork] Metadata uploaded. UUID = {uuid}")
    return uuid
|
||||
|
||||
|
||||
|
||||
def publish_record(session, uuid):
    """Grant group 1 view rights on record *uuid*, effectively publishing it.

    Raises requests.HTTPError when GeoNetwork rejects the sharing update.
    """
    print('[uuid]', uuid)

    headers = {
        "X-XSRF-TOKEN": session.cookies.get('XSRF-TOKEN'),
        "Accept": "application/json",
        "Content-Type": "application/json",
    }

    # Reset existing privileges and allow "view" for group 1 only.
    payload = {
        "clear": True,
        "privileges": [
            {"group": 1, "operations": {"view": True}},
        ],
    }

    sharing_url = f"{GEONETWORK_URL}/srv/api/records/{uuid}/sharing"
    response = session.put(sharing_url, json=payload, headers=headers)
    response.raise_for_status()
|
||||
|
||||
|
||||
# single stand func
|
||||
# def publish_record(uuid):
|
||||
# session, xsrf_token = create_gn_session()
|
||||
|
||||
# headers = {
|
||||
# "X-XSRF-TOKEN": xsrf_token,
|
||||
# "Content-Type": "application/json"
|
||||
# }
|
||||
|
||||
# url = f"{GEONETWORK_URL}/srv/api/records/{uuid}/sharing"
|
||||
|
||||
# payload = {
|
||||
# "clear": True,
|
||||
# "privileges": [
|
||||
# {"group": 1, "operations": {"view": True}}
|
||||
# ]
|
||||
# }
|
||||
|
||||
# resp = session.put(url, json=payload, headers=headers)
|
||||
# resp.raise_for_status()
|
||||
300
app/mapset_pipeline/core/publication/publish_geoserver.py
Executable file
300
app/mapset_pipeline/core/publication/publish_geoserver.py
Executable file
|
|
@ -0,0 +1,300 @@
|
|||
import requests
|
||||
import json
|
||||
import os
|
||||
from app.core.config import GEOSERVER_URL, GEOSERVER_USER, GEOSERVER_PASS, GEOSERVER_WORKSPACE
|
||||
|
||||
# DATASTORE = "postgis" #per OPD
# GeoServer datastore every layer is published into.
DATASTORE = "server_lokal"
# SLD_DIR = "./styles"

# BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# SLD_DIR = os.path.join(BASE_DIR, "styles")

# Resolve the SLD staging folder (<package root>/style_temp) relative to
# this file so the path works regardless of the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MAIN_DIR = os.path.abspath(os.path.join(BASE_DIR, "..", ".."))
SLD_DIR = os.path.join(MAIN_DIR, "style_temp")
|
||||
|
||||
|
||||
def publish_layer_to_geoserver(table: str, job_id: str):
    """Publish PostGIS table *table* as a GeoServer layer and style it.

    Registers the feature type in the configured datastore; if the job's SLD
    file (``<SLD_DIR>/<job_id>.sld``) exists, uploads it as a workspace style,
    sets it as the layer's default style, and deletes the local file. Finally
    reloads GeoServer and returns the service URLs for metadata publication.

    Args:
        table: name of the PostGIS table / layer to publish.
        job_id: pipeline job id; names the staged SLD file.

    Returns:
        dict with keys: table, style, wms_url, wfs_url, layer_url.

    Raises:
        Exception: when GeoServer rejects the SLD upload.
    """
    print(f"[GeoServer] Publish layer + upload SLD: {table}")

    # ==========================
    # 1. Publish Feature Type
    # ==========================
    # computeDefault=true lets GeoServer derive bbox/attributes itself.
    ft_url = f"{GEOSERVER_URL}/rest/workspaces/{GEOSERVER_WORKSPACE}/datastores/{DATASTORE}/featuretypes?computeDefault=true"

    payload = {
        "featureType": {
            "name": table,
            "nativeName": table,
            "enabled": True
        }
    }

    requests.post(
        ft_url,
        auth=(GEOSERVER_USER, GEOSERVER_PASS),
        headers={"Content-Type": "application/json"},
        data=json.dumps(payload)
    )

    print(f"[GeoServer] FeatureType published for: {table}")

    # ==========================================
    # 2. Upload SLD file to GeoServer
    # ==========================================
    sld_file = f"{SLD_DIR}/{job_id}.sld"
    style_name = table  # style name mirrors the table name

    if not os.path.exists(sld_file):
        # No SLD staged for this job: skip styling AND the cleanup below
        # (the original unconditionally removed the file, which raised
        # FileNotFoundError on this path).
        print(f"[WARNING] SLD file tidak ditemukan: {sld_file}")
    else:
        print(f"[GeoServer] Upload SLD {sld_file}")

        style_url = (
            f"{GEOSERVER_URL}/rest/workspaces/"
            f"{GEOSERVER_WORKSPACE}/styles"
        )

        with open(sld_file, "r", encoding="utf-8") as f:
            sld_content = f.read()

        # GeoServer rejects SLDs with a BOM or whitespace before the XML
        # declaration, so strip any leading junk before uploading.
        sld_content = sld_content.lstrip("\ufeff \t\r\n")

        resp = requests.post(
            f"{style_url}?name={style_name}",
            auth=(GEOSERVER_USER, GEOSERVER_PASS),
            headers={"Content-Type": "application/vnd.ogc.sld+xml"},
            data=sld_content.encode("utf-8")
        )

        if resp.status_code not in (200, 201):
            raise Exception(
                f"Upload SLD gagal ({resp.status_code}): {resp.text}"
            )

        print(f"[GeoServer] SLD uploaded: {style_name}")

        # ==========================================
        # 3. Apply SLD to the layer
        # ==========================================
        layer_url = f"{GEOSERVER_URL}/rest/layers/{GEOSERVER_WORKSPACE}:{table}"

        payload = {
            "layer": {
                "defaultStyle": {
                    "name": style_name,
                    "workspace": GEOSERVER_WORKSPACE
                },
                "enabled": True
            }
        }

        requests.put(
            layer_url,
            auth=(GEOSERVER_USER, GEOSERVER_PASS),
            headers={"Content-Type": "application/json"},
            data=json.dumps(payload)
        )

        print(f"[GeoServer] SLD applied as default style for {table}")

        # ==========================================
        # 4. Delete SLD file from local folder
        # ==========================================
        os.remove(sld_file)
        print(f"[CLEANUP] SLD file removed: {sld_file}")

    # ==============================================
    # 5. Reload GeoServer (optional but recommended)
    # ==============================================
    requests.post(
        f"{GEOSERVER_URL}/rest/reload",
        auth=(GEOSERVER_USER, GEOSERVER_PASS)
    )

    # ====================================================
    # 6. Generate GeoServer WMS/WFS links for GeoNetwork
    # ====================================================
    wms_link = (
        f"{GEOSERVER_URL}/{GEOSERVER_WORKSPACE}/wms?"
        f"service=WMS&request=GetMap&layers={GEOSERVER_WORKSPACE}:{table}"
    )
    wfs_link = (
        f"{GEOSERVER_URL}/{GEOSERVER_WORKSPACE}/wfs?"
        f"service=WFS&request=GetFeature&typeName={GEOSERVER_WORKSPACE}:{table}"
    )

    # Browser preview link (application/openlayers output format).
    openlayer_url = (
        f"{GEOSERVER_URL}/{GEOSERVER_WORKSPACE}/wms?"
        f"service=WMS"
        f"&version=1.1.0"
        f"&request=GetMap"
        f"&layers={GEOSERVER_WORKSPACE}:{table}"
        f"&styles="
        f"&bbox=110.89528623700005%2C-8.780412043999945%2C116.26994997700001%2C-5.042971664999925"
        f"&width=768"
        f"&height=384"
        f"&srs=EPSG:4326"
        f"&format=application/openlayers"
    )

    return {
        "table": table,
        "style": style_name,
        "wms_url": wms_link,
        "wfs_url": wfs_link,
        "layer_url": openlayer_url
    }
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# use default style
|
||||
# def publish_layer_to_geoserver(table: str):
|
||||
|
||||
# print(f"[GeoServer] Publish layer: {table}")
|
||||
|
||||
# # ========== 1. Publish Feature Type ==========
|
||||
# ft_url = f"{GEOSERVER_URL}/rest/workspaces/{WORKSPACE}/datastores/{DATASTORE}/featuretypes"
|
||||
|
||||
# payload = {
|
||||
# "featureType": {
|
||||
# "name": table,
|
||||
# "nativeName": table,
|
||||
# "enabled": True
|
||||
# }
|
||||
# }
|
||||
|
||||
# requests.post(
|
||||
# ft_url,
|
||||
# auth=(GEOSERVER_USER, GEOSERVER_PASS),
|
||||
# headers={"Content-Type": "application/json"},
|
||||
# data=json.dumps(payload)
|
||||
# )
|
||||
|
||||
# # ===================================================
|
||||
# # 2. Tentukan SLD file (prioritas table.sld → fallback default)
|
||||
# # ===================================================
|
||||
# table_sld = SLD_DIR / f"{table}.sld"
|
||||
# default_sld = SLD_DIR / "default_style.sld"
|
||||
|
||||
# if table_sld.exists():
|
||||
# chosen_sld = table_sld
|
||||
# delete_after = True
|
||||
# style_name = table # pakai nama style sama dengan layer
|
||||
# print(f"[SLD] Menggunakan SLD khusus: {chosen_sld}")
|
||||
# else:
|
||||
# chosen_sld = default_sld
|
||||
# delete_after = False
|
||||
# style_name = "default_style"
|
||||
# print(f"[SLD] Menggunakan default SLD: {chosen_sld}")
|
||||
|
||||
# # ==========================================
|
||||
# # 3. Upload SLD
|
||||
# # ==========================================
|
||||
# style_url = f"{GEOSERVER_URL}/rest/styles"
|
||||
|
||||
# with open(chosen_sld, "rb") as sld:
|
||||
# requests.post(
|
||||
# f"{style_url}?name={style_name}&workspace={WORKSPACE}",
|
||||
# auth=(GEOSERVER_USER, GEOSERVER_PASS),
|
||||
# headers={"Content-Type": "application/vnd.ogc.sld+xml"},
|
||||
# data=sld.read()
|
||||
# )
|
||||
|
||||
# print(f"[GeoServer] SLD uploaded: {style_name}")
|
||||
|
||||
# # ==========================================
|
||||
# # 4. Apply SLD ke layer
|
||||
# # ==========================================
|
||||
# layer_url = f"{GEOSERVER_URL}/rest/layers/{WORKSPACE}:{table}"
|
||||
|
||||
# payload = {
|
||||
# "layer": {
|
||||
# "defaultStyle": {
|
||||
# "name": style_name,
|
||||
# "workspace": WORKSPACE
|
||||
# },
|
||||
# "enabled": True
|
||||
# }
|
||||
# }
|
||||
|
||||
# requests.put(
|
||||
# layer_url,
|
||||
# auth=(GEOSERVER_USER, GEOSERVER_PASS),
|
||||
# headers={"Content-Type": "application/json"},
|
||||
# data=json.dumps(payload)
|
||||
# )
|
||||
|
||||
# print(f"[GeoServer] Style '{style_name}' applied to layer '{table}'")
|
||||
|
||||
# # ==========================================
|
||||
# # 5. Delete table.sld jika ada
|
||||
# # ==========================================
|
||||
# if delete_after:
|
||||
# table_sld.unlink()
|
||||
# print(f"[CLEANUP] File SLD '{table}.sld' dihapus")
|
||||
|
||||
# # ====================================================
|
||||
# # 6. Reload GeoServer (opsional tapi aman)
|
||||
# # ====================================================
|
||||
# requests.post(
|
||||
# f"{GEOSERVER_URL}/rest/reload",
|
||||
# auth=(GEOSERVER_USER, GEOSERVER_PASS)
|
||||
# )
|
||||
|
||||
# # ====================================================
|
||||
# # 7. Generate GeoServer WMS/WFS link untuk GeoNetwork
|
||||
# # ====================================================
|
||||
|
||||
# wms_link = (
|
||||
# f"{GEOSERVER_URL}/{WORKSPACE}/wms?"
|
||||
# f"service=WMS&request=GetMap&layers={WORKSPACE}:{table}"
|
||||
# )
|
||||
|
||||
# wfs_link = (
|
||||
# f"{GEOSERVER_URL}/{WORKSPACE}/wfs?"
|
||||
# f"service=WFS&request=GetFeature&typeName={WORKSPACE}:{table}"
|
||||
# )
|
||||
|
||||
# print(f"[GeoServer] WMS URL: {wms_link}")
|
||||
# print(f"[GeoServer] WFS URL: {wfs_link}")
|
||||
|
||||
# return {
|
||||
# "table": table,
|
||||
# "style": style_name,
|
||||
# "wms_url": wms_link,
|
||||
# "wfs_url": wfs_link
|
||||
# }
|
||||
|
||||
|
||||
18
app/mapset_pipeline/core/readers/__init__.py
Executable file
18
app/mapset_pipeline/core/readers/__init__.py
Executable file
|
|
@ -0,0 +1,18 @@
|
|||
# Import fungsi utama dari masing-masing file reader
|
||||
# (Titik '.' berarti import dari folder yang sama)
|
||||
|
||||
from .reader_csv import read_csv
|
||||
from .reader_shp import read_shp
|
||||
from .reader_gdb import read_gdb
|
||||
from .reader_mpk import read_mpk
|
||||
from .reader_pdf import read_pdf, convert_df
|
||||
|
||||
# Opsional: Mendefinisikan apa yang akan ter-import jika orang mengetik "from ... import *"
|
||||
__all__ = [
|
||||
"read_csv",
|
||||
"read_shp",
|
||||
"read_gdb",
|
||||
"read_mpk",
|
||||
"read_pdf",
|
||||
"convert_df"
|
||||
]
|
||||
228
app/mapset_pipeline/core/readers/reader_csv.py
Executable file
228
app/mapset_pipeline/core/readers/reader_csv.py
Executable file
|
|
@ -0,0 +1,228 @@
|
|||
import pandas as pd
|
||||
import re
|
||||
import csv
|
||||
import os
|
||||
|
||||
def detect_header_line(path, max_rows=10):
    """Guess which line of a delimited text file is the header row.

    Scores each of the first ``max_rows`` lines: a header is expected to be
    mostly alphabetic cells with few numeric cells, so the line maximising
    (alphabetic ratio - digit ratio) wins.

    Args:
        path: Path to the text/CSV file.
        max_rows: Number of leading lines to inspect.

    Returns:
        int: Zero-based index of the most header-like line (0 for an
        empty file).
    """
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        # Bug fix: the original used `next(f)` inside a list comprehension,
        # which raises StopIteration when the file has fewer than
        # `max_rows` lines. Read defensively and stop at EOF instead.
        lines = []
        for _ in range(max_rows):
            line = f.readline()
            if not line:
                break
            lines.append(line)

    header_line_idx = 0
    best_score = -1
    for i, line in enumerate(lines):
        cells = re.split(r'[;,|\t]', line.strip())
        # Fraction of cells containing letters vs. containing digits.
        alpha_ratio = sum(bool(re.search(r'[A-Za-z]', c)) for c in cells) / max(len(cells), 1)
        digit_ratio = sum(bool(re.search(r'\d', c)) for c in cells) / max(len(cells), 1)
        score = alpha_ratio - digit_ratio
        if score > best_score:
            best_score = score
            header_line_idx = i
    return header_line_idx
|
||||
|
||||
def detect_delimiter(path, sample_size=2048):
    """Detect the cell delimiter of a text file.

    Tries ``csv.Sniffer`` on a leading sample of the file; when sniffing
    fails, falls back to the first common delimiter present in the sample,
    defaulting to a comma.

    Args:
        path: Path to the text file.
        sample_size: Number of characters to sample from the file start.

    Returns:
        str: The detected single-character delimiter.
    """
    with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
        sample = handle.read(sample_size)

    try:
        return csv.Sniffer().sniff(sample).delimiter
    except Exception:
        # Sniffer could not decide: simple presence scan over the usual
        # suspects, in priority order.
        candidates = (',', ';', '\t', '|')
        return next((d for d in candidates if d in sample), ',')
|
||||
|
||||
|
||||
# def read_csv(path: str, sheet: str = None):
|
||||
# ext = os.path.splitext(path)[1].lower()
|
||||
|
||||
# try:
|
||||
# if ext in ['.csv']:
|
||||
# header_line = detect_header_line(path)
|
||||
# delimiter = detect_delimiter(path)
|
||||
# print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
|
||||
|
||||
# df = pd.read_csv(
|
||||
# path,
|
||||
# header=header_line,
|
||||
# sep=delimiter,
|
||||
# encoding='utf-8',
|
||||
# low_memory=False,
|
||||
# thousands=','
|
||||
# )
|
||||
|
||||
# elif ext in ['.xlsx', '.xls']:
|
||||
# print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
|
||||
# xls = pd.ExcelFile(path)
|
||||
# print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")
|
||||
|
||||
# if sheet:
|
||||
# if sheet not in xls.sheet_names:
|
||||
# raise ValueError(f"Sheet '{sheet}' tidak ditemukan dalam file {os.path.basename(path)}")
|
||||
# print(f"[INFO] Membaca sheet yang ditentukan: '{sheet}'")
|
||||
# df = pd.read_excel(xls, sheet_name=sheet, header=0, dtype=str)
|
||||
# df = df.dropna(how='all').dropna(axis=1, how='all')
|
||||
|
||||
# else:
|
||||
# print("[INFO] Tidak ada sheet yang ditentukan, mencari sheet paling relevan...")
|
||||
# best_sheet = None
|
||||
# best_score = -1
|
||||
# best_df = None
|
||||
|
||||
# for sheet_name in xls.sheet_names:
|
||||
# try:
|
||||
# temp_df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
|
||||
# temp_df = temp_df.dropna(how='all').dropna(axis=1, how='all')
|
||||
|
||||
# if len(temp_df) == 0 or len(temp_df.columns) < 2:
|
||||
# continue
|
||||
|
||||
# # hitung skor relevansi
|
||||
# text_ratio = temp_df.applymap(lambda x: isinstance(x, str)).sum().sum() / (temp_df.size or 1)
|
||||
# row_score = len(temp_df)
|
||||
# score = (row_score * 0.7) + (text_ratio * 100)
|
||||
|
||||
# if score > best_score:
|
||||
# best_score = score
|
||||
# best_sheet = sheet_name
|
||||
# best_df = temp_df
|
||||
|
||||
# except Exception as e:
|
||||
# print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
|
||||
# continue
|
||||
|
||||
# if best_df is not None:
|
||||
# print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
|
||||
# df = best_df
|
||||
# else:
|
||||
# raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
|
||||
|
||||
# for col in df.columns:
|
||||
# if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
|
||||
# df[col] = df[col].astype(str).str.replace(',', '', regex=False)
|
||||
# df[col] = pd.to_numeric(df[col], errors='ignore')
|
||||
|
||||
# else:
|
||||
# raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)")
|
||||
|
||||
# except Exception as e:
|
||||
# print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
|
||||
# df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
|
||||
|
||||
# df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
|
||||
# df.columns = [str(c).strip() for c in df.columns]
|
||||
# df = df.dropna(how='all')
|
||||
|
||||
# return df
|
||||
|
||||
|
||||
|
||||
def read_csv(path: str, sheet: str = None):
    """Read a tabular file (.csv, .xlsx or .xls) into a cleaned DataFrame.

    For CSV input the header row and delimiter are auto-detected. For
    Excel input either the requested ``sheet`` is read, or every sheet is
    scored (row count plus share of textual cells) and the best one kept.
    After reading, unnamed columns are dropped, column names stripped and
    numeric-looking columns coerced to numbers (bad cells become NaN).

    Args:
        path: Path to the input file.
        sheet: Optional Excel sheet name; ignored for CSV input.

    Returns:
        pandas.DataFrame: The parsed table, or an empty DataFrame when the
        file could not be recovered at all.
    """
    ext = os.path.splitext(path)[1].lower()
    df = pd.DataFrame()  # Safe default so the cleaning block below never NameErrors.

    try:
        # --- File reading ---
        if ext in ['.csv']:
            header_line = detect_header_line(path)
            delimiter = detect_delimiter(path)
            print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")

            df = pd.read_csv(
                path,
                header=header_line,
                sep=delimiter,
                encoding='utf-8',
                low_memory=False,
                thousands=','
            )

        elif ext in ['.xlsx', '.xls']:
            print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
            # Bug fix: openpyxl only understands .xlsx; forcing it on a
            # legacy .xls file always fails. Let pandas pick the engine
            # for .xls input.
            xls = pd.ExcelFile(path, engine='openpyxl' if ext == '.xlsx' else None)
            print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")

            if sheet:
                if sheet not in xls.sheet_names:
                    raise ValueError(f"Sheet '{sheet}' tidak ditemukan.")
                print(f"[INFO] Membaca sheet yang ditentukan: '{sheet}'")
                # The engine is already bound to the ExcelFile object, so
                # it must not be passed again here.
                df = pd.read_excel(xls, sheet_name=sheet, header=0, dtype=str)
                df = df.dropna(how='all').dropna(axis=1, how='all')

            else:
                print("[INFO] Tidak ada sheet yang ditentukan, mencari sheet paling relevan...")
                best_sheet = None
                best_score = -1
                best_df = None

                for sheet_name in xls.sheet_names:
                    try:
                        temp_df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
                        temp_df = temp_df.dropna(how='all').dropna(axis=1, how='all')

                        # Skip empty or effectively single-column sheets.
                        if len(temp_df) == 0 or len(temp_df.columns) < 2:
                            continue

                        # Relevance score: mostly row count, plus textual density.
                        # Bug fix: DataFrame.applymap was removed in pandas 3
                        # (this project pins pandas ^3.0.0); DataFrame.map is
                        # the supported element-wise equivalent.
                        text_ratio = temp_df.map(lambda x: isinstance(x, str)).sum().sum() / (temp_df.size or 1)
                        row_score = len(temp_df)
                        score = (row_score * 0.7) + (text_ratio * 100)

                        if score > best_score:
                            best_score = score
                            best_sheet = sheet_name
                            best_df = temp_df
                    except Exception as e:
                        print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
                        continue

                if best_df is not None:
                    print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
                    df = best_df
                else:
                    raise ValueError("Tidak ada sheet valid yang dapat dibaca.")

        else:
            raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)")

        # --- Cleaning (only after a successful read) ---
        # Wrapped so numeric-conversion errors never undo a successful read.
        if not df.empty:
            df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
            df.columns = [str(c).strip() for c in df.columns]
            df = df.dropna(how='all')

            # Safer numeric conversion per column.
            for col in df.columns:
                try:
                    # Does the column look numeric (possibly with thousands commas)?
                    if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
                        clean_col = df[col].astype(str).str.replace(',', '', regex=False)
                        # errors='coerce': bad cells (NaN/#REF!) become NaN
                        # instead of crashing the whole conversion.
                        df[col] = pd.to_numeric(clean_col, errors='coerce')
                except Exception as ex:
                    # Leave the column as string/object and continue.
                    print(f"[WARN] Gagal konversi numerik pada kolom '{col}': {ex}")

        return df

    except Exception as e:
        print(f"[WARN] Gagal membaca file utama ({e}).")

        # Only fall back to the plain CSV reader for text-like inputs;
        # never force read_csv onto an Excel file.
        if ext in ['.csv', '.txt']:
            print("[INFO] Mencoba fallback ke default CSV reader...")
            try:
                return pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
            except Exception as e2:
                print(f"[ERROR] Fallback CSV juga gagal: {e2}")

        # Excel (or fallback) could not be recovered: return an empty frame.
        print("[ERROR] Tidak dapat memulihkan pembacaan file Excel.")
        return pd.DataFrame()
|
||||
|
||||
|
||||
75
app/mapset_pipeline/core/readers/reader_gdb.py
Executable file
75
app/mapset_pipeline/core/readers/reader_gdb.py
Executable file
|
|
@ -0,0 +1,75 @@
|
|||
import geopandas as gpd
|
||||
import fiona
|
||||
import zipfile
|
||||
import tempfile
|
||||
import os
|
||||
import shutil
|
||||
|
||||
def read_gdb(zip_path: str, layer: str = None):
    """Read a layer from an ESRI File Geodatabase packed in a ZIP archive.

    The archive is extracted to a temporary directory; a ``*.gdb`` folder
    is located (or reconstructed from loose ``*.gdbtable`` files), the
    requested layer (or the first available one) is loaded with GeoPandas,
    and the temporary directory is always removed afterwards.

    Args:
        zip_path: Path to a ZIP containing a .gdb folder or .gdbtable files.
        layer: Optional layer name; defaults to the first listed layer.

    Returns:
        geopandas.GeoDataFrame with a CRS set (EPSG:4326 assumed when missing).

    Raises:
        ValueError: Non-ZIP input, ZIP without GDB content, no readable
            layer, or a layer that fails to load.
    """
    if not zip_path.lower().endswith(".zip"):
        raise ValueError("File GDB harus berupa ZIP yang berisi folder .gdb atau file .gdbtable")

    tmpdir = tempfile.mkdtemp()
    # Fix: the original leaked tmpdir on several error paths (e.g. when
    # fiona.listlayers raised). try/finally guarantees cleanup.
    try:
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(tmpdir)

        # macOS archives ship a metadata folder that confuses the scan below.
        macosx_path = os.path.join(tmpdir, "__MACOSX")
        if os.path.exists(macosx_path):
            shutil.rmtree(macosx_path)

        gdb_folders = []
        for root, dirs, _ in os.walk(tmpdir):
            for d in dirs:
                if d.lower().endswith(".gdb"):
                    gdb_folders.append(os.path.join(root, d))

        if not gdb_folders:
            # No .gdb folder: try rebuilding one from loose .gdbtable files.
            gdbtable_files = []
            for root, _, files in os.walk(tmpdir):
                for f in files:
                    if f.lower().endswith(".gdbtable"):
                        gdbtable_files.append(os.path.join(root, f))

            if gdbtable_files:
                first_folder = os.path.dirname(gdbtable_files[0])
                base_name = os.path.basename(first_folder)
                gdb_folder_path = os.path.join(tmpdir, f"{base_name}.gdb")

                os.makedirs(gdb_folder_path, exist_ok=True)

                # Move every GDB-related file into the rebuilt .gdb folder.
                for fpath in os.listdir(first_folder):
                    if ".gdb" in fpath.lower():
                        shutil.move(os.path.join(first_folder, fpath), os.path.join(gdb_folder_path, fpath))

                gdb_folders.append(gdb_folder_path)
            else:
                raise ValueError("Tidak ditemukan folder .gdb atau file .gdbtable di dalam ZIP")

        gdb_path = gdb_folders[0]

        layers = fiona.listlayers(gdb_path)

        chosen_layer = layer or (layers[0] if layers else None)
        if not chosen_layer:
            raise ValueError("Tidak ada layer GDB yang bisa dibaca.")

        print(f"[DEBUG] Membaca layer: {chosen_layer}")

        try:
            gdf = gpd.read_file(gdb_path, layer=chosen_layer)
        except Exception as e:
            raise ValueError(f"Gagal membaca layer dari GDB: {e}")

        if gdf.crs is None:
            # No projection info shipped: assume WGS84 — TODO confirm upstream.
            gdf.set_crs("EPSG:4326", inplace=True)

        return gdf
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)
|
||||
72
app/mapset_pipeline/core/readers/reader_mpk.py
Executable file
72
app/mapset_pipeline/core/readers/reader_mpk.py
Executable file
|
|
@ -0,0 +1,72 @@
|
|||
import os
|
||||
import tempfile
|
||||
import json
|
||||
from io import BytesIO
|
||||
import geopandas as gpd
|
||||
from py7zr import SevenZipFile
|
||||
import pyogrio
|
||||
|
||||
|
||||
def find_data_source(extract_dir: str):
    """Locate the supported data source inside an extraction directory.

    A ``*.gdb`` directory takes priority; otherwise the first ``*.shp``
    file found (in walk order) wins.

    Args:
        extract_dir: Directory to scan recursively.

    Returns:
        str: Absolute path of the located .gdb folder or .shp file.

    Raises:
        ValueError: When neither kind of source exists.
    """
    # Pass 1: prefer a File Geodatabase directory.
    for root, dirs, _ in os.walk(extract_dir):
        gdb_dirs = [d for d in dirs if d.lower().endswith(".gdb")]
        if gdb_dirs:
            return os.path.join(root, gdb_dirs[0])

    # Pass 2: fall back to the first shapefile.
    for root, _, files in os.walk(extract_dir):
        shp_files = [f for f in files if f.lower().endswith(".shp")]
        if shp_files:
            return os.path.join(root, shp_files[0])

    raise ValueError("Tidak ditemukan data source yang didukung (.gdb atau .shp).")
|
||||
|
||||
|
||||
def get_main_layer(gdb_path: str):
    """Return the name of the main layer in a geodatabase (.gdb).

    Attachment layers (``*__ATTACH``) are skipped; if only attachment
    layers exist, the first listed layer is returned as a last resort.

    Raises:
        ValueError: When the layer list cannot be read or is empty.
    """
    try:
        layers = pyogrio.list_layers(gdb_path)
        # First layer whose name is not an ESRI attachment table.
        main = next(
            (entry[0] for entry in layers if not entry[0].lower().endswith("__attach")),
            None,
        )
        if main is not None:
            return main
        if layers:
            return layers[0][0]
        raise ValueError(f"Tidak ada layer utama yang valid di {gdb_path}")
    except Exception as e:
        raise ValueError(f"Gagal membaca daftar layer GDB: {e}")
|
||||
|
||||
|
||||
def read_mpk(path: str):
    """Read an ArcGIS Map Package (.mpk) and return its features.

    The package (a 7z archive) is unpacked into a temporary directory,
    the contained data source (.gdb or .shp) is located and loaded, and
    the result is reprojected to WGS84 (EPSG:4326).

    Args:
        path: Path to the .mpk file.

    Returns:
        geopandas.GeoDataFrame in EPSG:4326.

    Raises:
        ValueError: Empty/corrupt package, unsupported content, or a
            source without projection information.
    """
    with open(path, "rb") as handle:
        mpk_bytes = handle.read()

    if not mpk_bytes:
        raise ValueError("File MPK kosong atau tidak valid.")

    with tempfile.TemporaryDirectory() as tempdir:
        try:
            with SevenZipFile(BytesIO(mpk_bytes), mode="r") as archive:
                archive.extractall(path=tempdir)
        except Exception as e:
            raise ValueError(f"File MPK rusak atau tidak valid: {e}")

        src_path = find_data_source(tempdir)

        if src_path.lower().endswith(".gdb"):
            # Geodatabase: load only the main (non-attachment) layer.
            gdf = gpd.read_file(src_path, layer=get_main_layer(src_path))
        else:
            gdf = gpd.read_file(src_path)

        if gdf.crs is None:
            raise ValueError("CRS tidak terdeteksi. Pastikan file memiliki informasi proyeksi (.prj).")

        gdf = gdf.to_crs(epsg=4326)

        print(f"[INFO] Berhasil membaca {len(gdf)} fitur")
        return gdf
|
||||
288
app/mapset_pipeline/core/readers/reader_pdf.py
Executable file
288
app/mapset_pipeline/core/readers/reader_pdf.py
Executable file
|
|
@ -0,0 +1,288 @@
|
|||
import re
|
||||
import pdfplumber
|
||||
import pandas as pd
|
||||
from app.mapset_pipeline.utils.pdf_cleaner import get_number_column_index, get_start_end_number, normalize_number_column, row_ratio, has_mixed_text_and_numbers, is_short_text_row, parse_page_selection, filter_geo_admin_column, cleaning_column
|
||||
from services.upload_file.upload_exceptions import PDFReadError
|
||||
from utils.logger_config import setup_logger
|
||||
|
||||
logger = setup_logger(__name__)
|
||||
|
||||
def detect_header_rows(rows):
    """Split raw extracted table rows into header rows and body rows.

    The body is assumed to start at the first row that mixes text and
    numbers, is numeric-heavy (ratio > 0.3), contains a pure-integer
    cell, or follows a fully non-numeric row with a numeric one. Among
    the header candidates, short text-only rows are kept only when the
    next candidate row is fully non-numeric (likely wrapped header text).

    Args:
        rows: List of table rows (lists of cell values) as extracted.

    Returns:
        tuple[list, list]: (header_rows, body_rows).
    """
    if not rows:
        # Bug fix: callers unpack two values (`head, body = ...`); the
        # original returned a bare [] here, which raised ValueError on
        # unpacking. Return an empty pair instead.
        return [], []

    ratios = [row_ratio(r) for r in rows]
    body_start_index = None

    for i in range(1, len(rows)):
        row = rows[i]
        if has_mixed_text_and_numbers(row):
            body_start_index = i
            break
        if ratios[i] > 0.3:
            body_start_index = i
            break
        if any(isinstance(c, str) and re.match(r'^\d+$', c.strip()) for c in row):
            body_start_index = i
            break
        if ratios[i - 1] == 0 and ratios[i] > 0:
            body_start_index = i
            break

    if body_start_index is None:
        # No body detected: every row is a header candidate.
        body_start_index = len(rows)

    potential_headers = rows[:body_start_index]
    body_filtered = rows[body_start_index:]
    header_filtered = []
    for idx, row in enumerate(potential_headers):
        if is_short_text_row(row):
            # Keep a short text row only when the following candidate is
            # fully non-numeric (wrapped header fragment); drop otherwise.
            if idx + 1 < len(potential_headers) and ratios[idx + 1] == 0:
                header_filtered.append(row)
            else:
                continue
        else:
            header_filtered.append(row)

    return header_filtered, body_filtered
|
||||
|
||||
|
||||
def merge_multiline_header(header_rows):
    """Collapse a multi-row table header into a single list of labels.

    For each column the bottom-most non-blank cell wins; embedded
    newlines are flattened to spaces and blank labels are dropped.

    Args:
        header_rows: List of header rows (lists of cell values).

    Returns:
        list[str]: Cleaned, non-empty column labels.
    """
    merged = []
    for column_cells in zip(*header_rows):
        label = ''
        # Walk bottom-up: the last filled cell is the real label.
        for cell in reversed(column_cells):
            if cell and str(cell).strip():
                label = cell
                break
        merged.append(str(label).replace('\n', ' ').strip())
    return [label for label in merged if label not in ['', None]]
|
||||
|
||||
def merge_parsed_table(tables):
    """Merge continuation-table fragments into their root tables.

    A table whose running-number column starts at 1 (or that has no
    number column at all) is a *root*; any other table is a *fragment*.
    A fragment is appended (rows extended in place) to the first root
    with identical columns whose numbering ends exactly one before the
    fragment starts.

    NOTE(review): root tables are mutated in place, and fragments that do
    not chain onto any root are silently dropped from the result —
    presumably intentional; confirm with the caller.

    Args:
        tables: List of {"title", "columns", "rows"} dicts.

    Returns:
        list[dict]: The root tables, with matching fragments merged in.
    """
    roots = []
    fragments = []

    # STEP 1: classification — roots vs. continuation fragments.
    for table in tables:
        num_idx = get_number_column_index(table["columns"])
        if num_idx is None:
            # No running-number column: treat as a standalone root.
            roots.append(table)
            continue

        start_no, _ = get_start_end_number(table["rows"], num_idx)
        if start_no == 1:
            roots.append(table)
        else:
            fragments.append(table)

    # STEP 2: attach each fragment to its matching root.
    for frag in fragments:
        frag_idx = get_number_column_index(frag["columns"])
        f_start, _ = get_start_end_number(frag["rows"], frag_idx)

        for root in roots:
            # Columns must match exactly for a merge to make sense.
            if root["columns"] != frag["columns"]:
                continue

            root_idx = get_number_column_index(root["columns"])
            _, r_end = get_start_end_number(root["rows"], root_idx)

            # Numbering must continue seamlessly (e.g. root ends at 25,
            # fragment starts at 26).
            if f_start == r_end + 1:
                root["rows"].extend(frag["rows"])
                break  # a fragment may attach to only one root

    return roots
|
||||
|
||||
|
||||
def read_pdf(path: str, page: str):
    """Read tables from a PDF file semi-automatically using ``pdfplumber``.

    Main flow:
    1. Open the PDF with pdfplumber.
    2. Select pages from the ``page`` expression (e.g. "1,3-5" for pages
       1 and 3 through 5).
    3. Detect tables on every selected page.
    4. Extract each raw table (list of lists).
    5. Split header and body rows with ``detect_header_rows()``.
    6. Merge multi-line headers (e.g. two-row column titles).
    7. Clean the body with ``cleaning_column()``:
       - removes the running-number column;
       - aligns the column count with the header.
    8. Assemble the result as JSON-shaped dicts:
       {"title": <page/table label>, "columns": [...], "rows": [...]}.
    9. Apply ``filter_geo_admin_column()`` (geospatial-metadata filter).
    10. Return a list of JSON-ready tables for the frontend API.

    Args:
        path: Location of the PDF file to read.
        page: Page number or range expression, e.g. "1", "2-4", "1,3-5".

    Returns:
        list[dict]: Extracted tables with their columns and rows.

    Raises:
        PDFReadError: On any failure while reading or parsing the PDF.
    """
    try:
        pdf_path = path
        selectedPage = page if page else "1"
        tables_data = []

        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            selected_pages = parse_page_selection(selectedPage, total_pages)

            logger.info(f"[INFO] Total Halaman PDF: {total_pages}")
            logger.info(f"[INFO] Total Halaman yang dipilih: {len(selected_pages)}")
            logger.info(f"[INFO] Halaman yang dipilih untuk dibaca: {selected_pages}")

            for page_num in selected_pages:
                pdf_page = pdf.pages[page_num - 1]
                tables = pdf_page.find_tables()
                logger.info(f"\n\n[INFO] Halaman {page_num}: {len(tables)} tabel terdeteksi")

                # NOTE: reading the table title from the text line above the
                # table was dropped — that heuristic is not valid for
                # landscape pages.

                for i, t in enumerate(tables, start=1):
                    table = t.extract()
                    # Keep only tables with more than two rows
                    # (i.e. at least a header plus some data).
                    if len(table) > 2:
                        print(f"[TBL] tabel : {i} - halaman {page_num}")
                        tables_data.append({"page": f"halaman {page_num} - {i}", "table": table})

        logger.info(f"\nTotal tabel terbaca: {len(tables_data)}\n")

        header_only, body_only, page_info = [], [], []
        for tbl in tables_data:
            head, body = detect_header_rows(tbl["table"])
            header_only.append(head)
            body_only.append(body)
            page_info.append(tbl["page"])

        clean_header = [merge_multiline_header(h) for h in header_only]
        clean_body = []

        for i, raw_body in enumerate(body_only):
            # Drop empty cells before aligning columns with the header.
            con_body = [[cell for cell in row if cell not in (None, '')] for row in raw_body]
            cleaned = cleaning_column(clean_header[i], [con_body])
            clean_body.append(cleaned[0])

        parsed = []
        # Fix: the loop variable was named `page`, shadowing the function
        # parameter; renamed to `page_label` (no behavior change — the
        # parameter is not used past this point).
        for i, (cols, rows, page_label) in enumerate(zip(clean_header, clean_body, page_info), start=1):
            parsed.append({
                "title": page_label,
                "columns": cols,
                "rows": rows
            })

        # =================================================================

        clean_parsed = filter_geo_admin_column(parsed)
        merge_parsed = merge_parsed_table(clean_parsed)

        logger.info(f"\nTotal tabel valid: {len(merge_parsed)}\n")

        ordered_tables = [normalize_number_column(t) for t in merge_parsed]
        return ordered_tables

    except Exception as e:
        raise PDFReadError(f"Gagal membaca PDF: {e}", code=422)
|
||||
|
||||
|
||||
def convert_df(payload):
    """Convert a parsed-table payload into a pandas DataFrame.

    ``payload`` must carry list-valued "columns" and "rows" keys with
    every row matching the column count; an optional "title" is stored in
    ``DataFrame.attrs``.

    Args:
        payload: Dict with "columns", "rows" and optionally "title".

    Returns:
        pandas.DataFrame built from the payload.

    Raises:
        PDFReadError: (code 400) wrapping any validation or build error.
    """
    try:
        if "columns" not in payload or "rows" not in payload:
            raise ValueError("Payload tidak memiliki key 'columns' atau 'rows'.")

        columns = payload["columns"]
        rows = payload["rows"]
        if not isinstance(columns, list):
            raise TypeError("'columns' harus berupa list.")
        if not isinstance(rows, list):
            raise TypeError("'rows' harus berupa list.")

        # Every row must be exactly as wide as the header.
        expected_width = len(columns)
        for i, row in enumerate(rows):
            if len(row) != expected_width:
                raise ValueError(f"Jumlah elemen di baris ke-{i} tidak sesuai jumlah kolom.")

        df = pd.DataFrame(rows, columns=columns)

        if "title" in payload:
            df.attrs["title"] = payload["title"]

        return df

    except Exception as e:
        raise PDFReadError(f"Gagal konversi payload ke DataFrame: {e}", code=400)
|
||||
60
app/mapset_pipeline/core/readers/reader_shp.py
Executable file
60
app/mapset_pipeline/core/readers/reader_shp.py
Executable file
|
|
@ -0,0 +1,60 @@
|
|||
import geopandas as gpd
|
||||
import fiona
|
||||
import zipfile
|
||||
import tempfile
|
||||
import os
|
||||
import shutil
|
||||
from shapely.geometry import shape
|
||||
|
||||
def read_shp(path: str):
    """Read a shapefile, either directly or from inside a ZIP archive.

    When GeoPandas returns empty geometry, the features are rebuilt from
    the raw Fiona records as a fallback. A missing CRS is assumed to be
    EPSG:4326.

    Args:
        path: Path to a .shp file or a .zip containing one.

    Returns:
        geopandas.GeoDataFrame

    Raises:
        ValueError: Empty path, ZIP without a .shp, or unreadable shapefile.
    """
    if not path:
        raise ValueError("Path shapefile tidak boleh kosong.")

    tmpdir = None
    shp_path = None

    # Fix: the original only removed tmpdir at the very end, leaking it
    # whenever an earlier error path raised ("no .shp in ZIP", read
    # failure). try/finally guarantees cleanup.
    try:
        if path.lower().endswith(".zip"):
            tmpdir = tempfile.mkdtemp()
            with zipfile.ZipFile(path, "r") as zip_ref:
                zip_ref.extractall(tmpdir)

            shp_files = []
            for root, _, files in os.walk(tmpdir):
                for f in files:
                    if f.lower().endswith(".shp"):
                        shp_files.append(os.path.join(root, f))

            if not shp_files:
                raise ValueError("Tidak ditemukan file .shp di dalam ZIP.")
            # Use the first shapefile found in walk order.
            shp_path = shp_files[0]
            print(f"[DEBUG] Membaca shapefile: {os.path.basename(shp_path)}")

        else:
            shp_path = path

        try:
            gdf = gpd.read_file(shp_path)
        except Exception as e:
            raise ValueError(f"Gagal membaca shapefile: {e}")

        if "geometry" not in gdf.columns or gdf.geometry.is_empty.all():
            print("[WARN] Geometry kosong. Mencoba membangun ulang dari fitur mentah...")

            # Fallback: rebuild rows from raw Fiona features.
            with fiona.open(shp_path) as src:
                features = []
                for feat in src:
                    geom = shape(feat["geometry"]) if feat["geometry"] else None
                    props = feat["properties"]
                    props["geometry"] = geom
                    features.append(props)

                gdf = gpd.GeoDataFrame(features, geometry="geometry", crs=src.crs)

        if gdf.crs is None:
            # No .prj information: assume WGS84 — TODO confirm upstream.
            gdf.set_crs("EPSG:4326", inplace=True)

        return gdf
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
|
||||
259
app/mapset_pipeline/data/repository.py
Executable file
259
app/mapset_pipeline/data/repository.py
Executable file
|
|
@ -0,0 +1,259 @@
|
|||
import os
|
||||
import json
|
||||
import asyncio
|
||||
import pandas as pd
|
||||
from shapely import wkt, wkb
|
||||
from shapely.geometry import MultiPolygon, MultiLineString
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
|
||||
# Import koneksi database Anda
|
||||
from database.connection import engine
|
||||
from app.mapset_pipeline.utils.formatters import str_to_date
|
||||
|
||||
|
||||
async def generate_unique_table_name(base_name: str) -> str:
    """Return a table name that does not yet exist in the database.

    The base name is normalized to snake_case; when taken, numeric
    suffixes (_2, _3, ...) are tried until a free name is found.

    Args:
        base_name: Desired human-readable table name.

    Returns:
        str: A normalized, not-yet-existing table name.
    """
    normalized = base_name.lower().replace(" ", "_").replace("-", "_")

    async with engine.connect() as conn:
        candidate = normalized
        suffix = 2
        while True:
            # to_regclass resolves against the current search path and
            # yields NULL when the relation does not exist.
            result = await conn.execute(
                text("SELECT to_regclass(:tname)"),
                {"tname": candidate}
            )
            if not result.scalar():
                return candidate

            candidate = f"{normalized}_{suffix}"
            suffix += 1
|
||||
|
||||
|
||||
async def insert_parquet_to_postgis(filename: str, table_name: str):
    """
    Read a temporary parquet file, clean its rows, and bulk-COPY them into
    PostGIS via the asyncpg pool for high throughput.

    Steps: load the parquet off the event loop, upper-case/strip column
    names, validate a GEOM column exists, convert each geometry to EWKT
    (SRID 4326, forced to Multi* and repaired with buffer(0) when invalid),
    stringify all attributes, create the target table with TEXT columns,
    COPY the rows, then ALTER the geom column to a typed geometry with a
    GIST index. The temp file is removed on success.

    NOTE(review): `table_name` is interpolated directly into SQL — it is
    presumably generated internally (see generate_unique_table_name);
    confirm it is never user-controlled.

    Args:
        filename: Name of the parquet file inside the local "tmp" folder.
        table_name: Destination table name.

    Returns:
        dict: {"table_name", "row_count", "geom_type"} summary.

    Raises:
        FileNotFoundError: When the temp parquet file is missing.
        ValueError: Missing GEOM column or no valid geometry rows.
    """
    # Imported here to avoid a circular import with main at module load.
    from main import db_pool
    file_path = os.path.join("tmp", filename)

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File temp {file_path} tidak ditemukan")

    try:
        loop = asyncio.get_running_loop()
        # Read parquet (CPU bound; run in executor so big files don't block the loop).
        df = await loop.run_in_executor(None, pd.read_parquet, file_path)

        # 1. CLEAN COLUMN NAMES (upper-cased, whitespace-stripped).
        df.columns = [str(col).strip().upper() for col in df.columns]

        # Standardize the GEOM column.
        # NOTE(review): this rename maps "GEOM" to itself — a no-op as written.
        if "GEOM" in df.columns:
            df.rename(columns={"GEOM": "GEOM"}, inplace=True)

        if "GEOM" not in df.columns:
            raise ValueError("Kolom GEOM tidak ditemukan dalam Parquet")

        # 2. PREPARE DATA ROWS
        clean_rows = []
        geom_types = set()

        # All attributes except GEOM.
        attr_columns = [col for col in df.columns if col != "GEOM"]

        for row in df.itertuples(index=False):
            # --- Handle GEOM ---
            raw_geom = getattr(row, "GEOM", None)
            if not raw_geom: continue

            try:
                geom = None
                # Accept WKT (str) or WKB (bytes) encodings.
                if isinstance(raw_geom, str):
                    geom = wkt.loads(raw_geom)
                elif isinstance(raw_geom, bytes):
                    geom = wkb.loads(raw_geom)

                if not geom: continue

                # Repair invalid geometry (classic buffer(0) trick).
                if not geom.is_valid:
                    geom = geom.buffer(0)

                # Force Multi-geometry so the whole column is uniform.
                gtype = geom.geom_type.upper()
                if gtype == "POLYGON": geom = MultiPolygon([geom])
                elif gtype == "LINESTRING": geom = MultiLineString([geom])

                geom_types.add(geom.geom_type)

                # Convert to EWKT (SRID 4326).
                ewkt = f"SRID=4326;{geom.wkt}"

            except Exception:
                continue  # Skip rows with broken geometry

            # --- Handle attributes (FORCE STRING for safe TEXT columns) ---
            row_data = []
            for col in attr_columns:
                val = getattr(row, col, None)
                if val is not None:
                    row_data.append(str(val))
                else:
                    row_data.append(None)

            row_data.append(ewkt)
            clean_rows.append(tuple(row_data))

        if not clean_rows:
            raise ValueError("Data valid kosong setelah pemrosesan geometry")

        # 3. DATABASE OPERATIONS
        # Pick one observed geometry type; ensure it is the Multi* variant.
        final_geom_type = list(geom_types)[0].upper() if geom_types else "GEOM"
        if "MULTI" not in final_geom_type and final_geom_type != "GEOM":
            final_geom_type = "MULTI" + final_geom_type

        # A. CREATE TABLE
        col_defs = [f'"{col}" TEXT' for col in attr_columns]  # All attributes as TEXT first, for safety

        create_sql = f"""
            CREATE TABLE {table_name} (
                _id SERIAL PRIMARY KEY,
                {', '.join(col_defs)},
                geom TEXT
            );
        """

        async with db_pool.acquire() as conn:
            # Create table
            await conn.execute(create_sql)

            # B. COPY data (bulk insert)
            target_cols = attr_columns + ['geom']
            # asyncpg quotes column names automatically.
            await conn.copy_records_to_table(
                table_name,
                records=clean_rows,
                columns=target_cols
            )

            # C. ALTER the geom column to a typed geometry and add a GIST index.
            alter_sql = f"""
                ALTER TABLE {table_name}
                ALTER COLUMN geom TYPE geometry({final_geom_type}, 4326)
                USING ST_Force2D(geom::geometry)::geometry({final_geom_type}, 4326);

                CREATE INDEX idx_{table_name}_geom ON {table_name} USING GIST (geom);
            """
            await conn.execute(alter_sql)

        print(f"[SUCCESS] Upload {len(clean_rows)} baris ke tabel {table_name}.")

        # Remove the temp file after success (best-effort).
        try:
            os.remove(file_path)
        except OSError:
            pass

        return {
            "table_name": table_name,
            "row_count": len(clean_rows),
            "geom_type": final_geom_type
        }

    except Exception as e:
        print(f"[ERROR] Processing parquet to DB: {e}")
        raise e
|
||||
|
||||
|
||||
async def save_author_metadata(payload_author: dict, table_name: str, dataset_title: str,
                               geom_types: list, row_count: int, user_id: int):
    """
    Save author metadata and dataset information into backend.author_metadata.

    Args:
        payload_author: raw author form fields from the frontend (keys like
            abstract, keywords, topicCategory, dateCreated, status,
            organization, contactName, contactEmail, contactPhone).
        table_name: physical table name the dataset was loaded into.
        dataset_title: human-readable dataset title.
        geom_types: geometry type names found in the dataset (stored as JSON).
        row_count: number of geometry rows inserted.
        user_id: id of the uploading user.
    """
    query = text("""
        INSERT INTO backend.author_metadata (
            table_title,
            dataset_title,
            dataset_abstract,
            keywords,
            topic_category,
            date_created,
            dataset_status,
            organization_name,
            contact_person_name,
            contact_email,
            contact_phone,
            geom_type,
            user_id,
            process,
            geometry_count
        ) VALUES (
            :table_title,
            :dataset_title,
            :dataset_abstract,
            :keywords,
            :topic_category,
            :date_created,
            :dataset_status,
            :organization_name,
            :contact_person_name,
            :contact_email,
            :contact_phone,
            :geom_type,
            :user_id,
            :process,
            :geometry_count
        )
    """)

    # Map frontend payload keys to DB columns; missing keys become NULL.
    params = {
        "table_title": table_name,
        "dataset_title": dataset_title,
        "dataset_abstract": payload_author.get("abstract"),
        "keywords": payload_author.get("keywords"),
        # topicCategory arrives as a list; stored as a comma-separated string.
        "topic_category": ", ".join(payload_author.get("topicCategory", [])),
        # str_to_date returns None for empty or malformed date strings.
        "date_created": str_to_date(payload_author.get("dateCreated")),
        "dataset_status": payload_author.get("status"),
        "organization_name": payload_author.get("organization"),
        "contact_person_name": payload_author.get("contactName"),
        "contact_email": payload_author.get("contactEmail"),
        "contact_phone": payload_author.get("contactPhone"),
        # Stored as a JSON-encoded list of geometry type names.
        "geom_type": json.dumps(geom_types),
        "user_id": user_id,
        "process": 'CLEANSING',
        "geometry_count": row_count
    }

    # engine.begin() opens a transaction and commits it on success.
    async with engine.begin() as conn:
        await conn.execute(query, params)
|
||||
|
||||
|
||||
async def call_cleansing_procedure(table_name: str):
    """
    Run the geometry-cleansing stored procedure for *table_name*.

    Returns:
        The literal string "done" on success.

    Raises:
        RuntimeError: when the stored procedure fails at the database level,
            so the calling service knows the cleansing step did not complete.
    """
    print(f"[INFO] Memulai cleansing database untuk tabel: {table_name}")

    # Safe parameter binding — the table name is never interpolated into SQL.
    stmt = text("CALL pr_cleansing_satupeta_polygon(:table_name, NULL);")

    try:
        async with engine.begin() as conn:
            await conn.execute(stmt, {"table_name": table_name})
    except SQLAlchemyError as e:
        print(f"[ERROR] Cleansing database gagal: {e}")
        # Surface the failure explicitly so the service layer can react.
        raise RuntimeError(f"Database cleansing failed: {str(e)}")

    print(f"[SUCCESS] Cleansing selesai untuk tabel: {table_name}")
    return "done"
|
||||
|
||||
|
||||
286
app/mapset_pipeline/service.py
Executable file
286
app/mapset_pipeline/service.py
Executable file
|
|
@ -0,0 +1,286 @@
|
|||
import os
|
||||
import shutil
|
||||
import pandas as pd
|
||||
from fastapi import UploadFile, HTTPException
|
||||
from typing import Optional
|
||||
|
||||
# --- Internal Modules ---
|
||||
from .api.schemas import UploadRequest, PdfRequest
|
||||
from .core.processing.analyzer import analyze_and_clean_dataframe, publish_mapset
|
||||
from .core.readers import (
|
||||
read_csv,
|
||||
read_shp,
|
||||
read_gdb,
|
||||
read_mpk,
|
||||
read_pdf,
|
||||
convert_df
|
||||
)
|
||||
from .data.repository import (
|
||||
generate_unique_table_name,
|
||||
insert_parquet_to_postgis,
|
||||
save_author_metadata,
|
||||
call_cleansing_procedure
|
||||
)
|
||||
|
||||
from app.mapset_pipeline.utils.file_ops import (
|
||||
detect_zip_type,
|
||||
generate_job_id,
|
||||
)
|
||||
from app.mapset_pipeline.utils.formatters import (
|
||||
save_xml_to_sld,
|
||||
)
|
||||
|
||||
# --- Legacy/External Modules (Sesuai kode asli Anda) ---
|
||||
from app.core.config import UPLOAD_FOLDER, MAX_FILE_MB, GEONETWORK_URL
|
||||
from utils.logger_config import log_activity
|
||||
|
||||
# from api.routers.datasets_router import (
|
||||
# upload_to_main
|
||||
# )
|
||||
|
||||
async def handle_file_analysis(
    file: UploadFile,
    page: Optional[str] = "",
    sheet: Optional[str] = "",
    fileDesc: Optional[str] = ""
):
    """
    Orchestrator for the /upload endpoint.

    1. Persist the uploaded file to disk.
    2. Pick a reader based on the file extension.
    3. Run the analysis/cleaning processor.
    4. Remove the physical file.

    Args:
        file: the uploaded file (read fully into memory for the size check).
        page: page selection, only meaningful for PDF input.
        sheet: sheet name, only meaningful for XLSX input.
        fileDesc: free-text description forwarded to the analyzer.

    Raises:
        HTTPException: 413 when too large, 400 for unsupported input,
            422 when no valid table is found, 500 on unexpected errors.
    """
    fname = file.filename
    ext = os.path.splitext(fname)[1].lower()

    # 1. Validate size & persist.
    # NOTE: the whole file is held in memory here, so MAX_FILE_MB also bounds
    # the memory used by this request.
    contents = await file.read()
    size_mb = len(contents) / (1024 * 1024)
    if size_mb > MAX_FILE_MB:
        raise HTTPException(status_code=413, detail="Ukuran File Terlalu Besar")

    tmp_path = UPLOAD_FOLDER / fname
    # Make sure the upload folder exists.
    os.makedirs(UPLOAD_FOLDER, exist_ok=True)

    with open(tmp_path, "wb") as f:
        f.write(contents)

    df = None
    try:
        # 2. Route to a reader based on the extension.
        print(f"[INFO] Processing file type: {ext}")

        if ext == ".csv":
            df = read_csv(str(tmp_path))
        elif ext == ".xlsx":
            df = read_csv(str(tmp_path), sheet)  # read_csv also handles xlsx in this codebase
        elif ext == ".mpk":
            df = read_mpk(str(tmp_path))
        elif ext == ".pdf":
            # PDF extraction may yield zero, one, or many tables.
            tbl = read_pdf(tmp_path, page)
            if len(tbl) == 0:
                return {
                    "message": "Tidak ditemukan tabel valid pada halaman yang dipilih",
                    "tables": {},
                    "file_type": ext
                }
            elif len(tbl) > 1:
                # Multiple tables: return them so the frontend can pick one.
                return {
                    "message": "File berhasil dibaca, ditemukan banyak tabel.",
                    "tables": tbl,
                    "file_type": ext
                }
            else:
                df = convert_df(tbl[0])
        elif ext == ".zip":
            zip_type = detect_zip_type(str(tmp_path))
            if zip_type == "shp":
                df = read_shp(str(tmp_path))
            elif zip_type == "gdb":
                df = read_gdb(str(tmp_path))
            else:
                raise HTTPException(status_code=400, detail="ZIP file tidak mengandung SHP / GDB valid.")
        else:
            raise HTTPException(status_code=400, detail="Unsupported file type")

        # Guard against an empty dataframe.
        if df is None or (hasattr(df, "empty") and df.empty):
            raise HTTPException(status_code=422, detail="File berhasil dibaca, tetapi tidak ditemukan tabel valid")

        # 3. Run the processor (cleaning & validation logic).
        result_analysis = await analyze_and_clean_dataframe(df, ext, fname, fileDesc)
        return result_analysis

    except HTTPException:
        # BUGFIX: deliberate HTTP errors raised above (400/422) were previously
        # swallowed by the generic handler below and re-emitted as 500s.
        raise

    except Exception as e:
        print(f"[ERROR] handle_file_analysis: {e}")
        raise HTTPException(status_code=500, detail=str(e))

    finally:
        # 4. Remove the raw upload; the processor's temp parquet file survives
        # until the frontend sends the ingest request.
        if tmp_path.exists():
            try:
                os.remove(tmp_path)
            except Exception:
                pass
|
||||
|
||||
|
||||
async def process_pdf_file(payload: PdfRequest):
    """
    Handle a PDF upload where the user selected a specific table.

    The request body (via ``model_dump``) is converted to a DataFrame by the
    same ``convert_df`` helper the PDF reader uses, then routed through the
    shared analysis processor.

    Raises:
        HTTPException: 422 when the payload contains no valid table,
            500 on unexpected errors.
    """
    try:
        df = convert_df(payload.model_dump())

        if df is None or (hasattr(df, "empty") and df.empty):
            raise HTTPException(status_code=422, detail="Tidak ada tabel valid dalam PDF")

        # Reuse the same processor as regular uploads.
        return await analyze_and_clean_dataframe(
            df, '.pdf', payload.fileName, payload.fileDesc
        )
    except HTTPException:
        # BUGFIX: the 422 above was previously caught by the generic handler
        # and re-raised as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
async def execute_postgis_ingestion(payload: UploadRequest, user_id: int):
    """
    Orchestrator for the /process-to-postgis endpoint.

    1. Receive data (JSON rows).
    2. Convert to a temporary Parquet file.
    3. Upload to PostGIS (via repository).
    4. Save metadata (via repository).
    5. Trigger cleansing & publishing.
    6. Activity logging.

    Raises:
        HTTPException: 500 wrapping any failure in the pipeline (the original
            error is logged first via log_activity).
    """
    job_id = generate_job_id(str(user_id))

    try:
        # 1. Generate a unique physical table name from the dataset title.
        table_name = await generate_unique_table_name(payload.title)

        # 2. Data preparation (JSON -> DataFrame -> Parquet).
        # A parquet file is required because insert_parquet_to_postgis reads
        # from disk; this also separates memory load between API and DB work.
        df = pd.DataFrame(payload.rows)

        # Upper-case all column names for consistency downstream.
        df.columns = [col.upper() for col in df.columns]

        # Standardise the geometry column name coming from the frontend.
        if "GEOMETRY" in df.columns:
            df.rename(columns={"GEOMETRY": "GEOM"}, inplace=True)

        # Persist to a temp file for the repository layer to consume.
        temp_parquet_name = f"{job_id}.parquet"
        temp_parquet_path = os.path.join("tmp", temp_parquet_name)
        os.makedirs("tmp", exist_ok=True)

        # Save parquet (pyarrow or fastparquet engine).
        df.to_parquet(temp_parquet_path, index=False)

        # 3. Insert into PostGIS: the repository reads the parquet file,
        # cleans the geometries and bulk-copies into the new table.
        db_result = await insert_parquet_to_postgis(temp_parquet_name, table_name)

        # 4. Save metadata — geometry type and row count are taken from the
        # DB result, which is more accurate than the raw input.
        final_geom_types = [db_result['geom_type']]  # simplified to a list
        row_count = db_result['row_count']

        await save_author_metadata(
            payload_author=payload.author,
            table_name=table_name,
            dataset_title=payload.title,
            geom_types=final_geom_types,
            row_count=row_count,
            user_id=user_id
        )

        # 5. Activity logging.
        await log_activity(
            user_id=user_id,
            action_type="UPLOAD",
            action_title=f"Upload dataset {table_name}",
            details={"table_name": table_name, "rows": row_count}
        )

        # 6. Post-processing (external APIs). job_status/metadata_uuid are
        # filled in below as those steps complete.
        result = {
            "job_id": job_id,
            "job_status": "wait",
            "table_name": table_name,
            "status": "success",
            "message": f"Tabel '{table_name}' berhasil dibuat.",
            "total_rows": row_count,
            "geometry_type": final_geom_types,
            "crs": payload.author.get("crs", "EPSG:4326"),
            "metadata_uuid": ""
        }

        # Save the layer style (SLD) for later publishing.
        save_xml_to_sld(payload.style, job_id)

        # Cleansing via stored procedure; a failure here is non-fatal and is
        # reported through job_status instead of aborting the upload.
        try:
            cleansing_status = await call_cleansing_procedure(table_name)
        except Exception as e:
            cleansing_status = "failed"
            print(f"Cleansing warning: {e}")
        result['job_status'] = cleansing_status

        # Publish layer (GeoServer/GeoNetwork).
        publish_info = await publish_mapset(table_name, job_id)
        result['metadata_uuid'] = publish_info.get('uuid', '')

        # 7. Upload to the main portal (mapset integration).
        mapset_payload = {
            "name": payload.title,
            "description": payload.author.get("abstract"),
            "scale": "1:25000",
            # IDs hard-coded as in the original code (consider moving to config/env).
            'projection_system_id': '0196c746-d1ba-7f1c-9706-5df738679cc7',
            "category_id": payload.author.get("mapsetCategory"),
            "data_status": "sementara",
            'classification_id': '01968b4b-d3f9-76c9-888c-ee887ac31ce4',
            'producer_id': '01968b54-0000-7a67-bd10-975b8923b93e',
            "layer_type": final_geom_types[0],
            'source_id': ['019c03ef-35e1-738b-858d-871dc7d1e4d6'],
            "layer_url": publish_info.get('geos_link', ''),
            "metadata_url": f"{GEONETWORK_URL}/srv/eng/catalog.search#/metadata/{publish_info.get('uuid', '')}",
            "coverage_level": "provinsi",
            "coverage_area": "kabupaten",
            "data_update_period": "Tahunan",
            "data_version": "2026",
            "is_popular": False,
            "is_active": True,
            'regional_id': '01968b53-a910-7a67-bd10-975b8923b92e',
            "notes": "Mapset baru dibuat",
            "status_validation": "on_verification",
        }

        # await upload_to_main(mapset_payload)

        return result

    except Exception as e:
        # Error handling & logging.
        await log_activity(
            user_id=user_id,
            action_type="ERROR",
            action_title="Upload gagal",
            details={"error": str(e)}
        )
        print(f"[ERROR] execute_postgis_ingestion: {e}")
        # Re-raise as an HTTPException so the router returns a clean 500.
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
105
app/mapset_pipeline/utils/file_ops.py
Executable file
105
app/mapset_pipeline/utils/file_ops.py
Executable file
|
|
@ -0,0 +1,105 @@
|
|||
import os
|
||||
import uuid
|
||||
import zipfile
|
||||
import geopandas as gpd
|
||||
from shapely import wkt
|
||||
from shapely.errors import ShapelyError
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def detect_zip_type(zip_path: str) -> str:
    """
    Classify a zip archive by its spatial content.

    Returns:
        "gdb" when the archive contains a File Geodatabase (a ``.gdb/``
        directory or gdb-specific file extensions), "shp" when it contains a
        shapefile, "unknown" otherwise.
    """
    with zipfile.ZipFile(zip_path, "r") as archive:
        names = [entry.lower() for entry in archive.namelist()]

    # A ".gdb/" path component anywhere marks a File Geodatabase folder.
    if any(".gdb/" in name for name in names):
        return "gdb"

    # Loose geodatabase files without the .gdb folder structure.
    gdb_suffixes = (".gdbtable", ".gdbtablx", ".gdbindexes", ".spx")
    if any(name.endswith(gdb_suffixes) for name in names):
        return "gdb"

    if any(name.endswith(".shp") for name in names):
        return "shp"

    return "unknown"
|
||||
|
||||
|
||||
def generate_unique_filename(folder="tmp", ext="parquet", digits=6):
    """
    Return a path "<folder>/<random-id>.<ext>" that does not exist yet.

    The folder is created if missing. A random UUID integer makes collisions
    practically impossible, but the existence check keeps the guarantee.

    Note: ``digits`` is kept for backward compatibility but is unused — the
    id length is determined by ``uuid.uuid4().int``.
    """
    os.makedirs(folder, exist_ok=True)
    while True:
        # BUGFIX: the original had a duplicated assignment
        # (``file_id = file_id = uuid.uuid4().int``).
        file_id = uuid.uuid4().int
        candidate = f"{folder}/{file_id}.{ext}"

        if not os.path.exists(candidate):
            return candidate
|
||||
|
||||
|
||||
def generate_job_id(user_id: str) -> str:
    """Compose a job id "<user_id>_<YYYYMMDDHHMMSS>" from the current local time."""
    stamp = datetime.now().strftime("%Y%m%d%H%M%S")
    return "_".join((user_id, stamp))
|
||||
|
||||
|
||||
def dataframe_validation(df_input, tmp_file):
    """
    Validate, clean, and export a tabular dataset with WKT geometry to parquet.

    Runs in a separate thread (CPU bound). Expects *df_input* to carry a
    ``geometry`` column of WKT strings; writes the cleaned GeoDataFrame to
    *tmp_file* and returns the number of rows exported.

    Raises:
        ValueError: when no row yields a valid geometry.
    """
    # 1. Work on a copy so the caller's dataframe is untouched.
    export_df = df_input.copy()

    # =========================================================================
    # STAGE 1: SAFE WKT LOADING
    # =========================================================================
    def safe_load_wkt(raw):
        """Parse a WKT string; return None for non-strings or unparseable input."""
        if not isinstance(raw, str):
            return None
        try:
            return wkt.loads(raw)
        # Broad catch is deliberate: any parse failure just voids the row.
        # (The original ``except (ShapelyError, Exception)`` was redundant —
        # Exception already covers ShapelyError.)
        except Exception:
            return None

    export_df["geom"] = export_df["geometry"].apply(safe_load_wkt)

    # =========================================================================
    # STAGE 2: FILTER NULL & INVALID GEOMETRY
    # =========================================================================
    # Drop rows where WKT parsing failed (None).
    export_df = export_df[export_df["geom"].notnull()]
    if export_df.empty:
        raise ValueError("Tidak ada data spasial valid yang ditemukan.")

    # Promote to a GeoDataFrame.
    export_df = gpd.GeoDataFrame(export_df, geometry="geom")

    # =========================================================================
    # STAGE 3: FIX TOPOLOGY
    # =========================================================================
    # buffer(0) is the standard GIS trick to repair light topology errors
    # (e.g. self-intersecting polygon rings).
    export_df["geom"] = export_df["geom"].apply(
        lambda g: g.buffer(0) if not g.is_valid else g
    )

    # Drop geometries that became empty after the fix (rare, but safe).
    export_df = export_df[~export_df["geom"].is_empty]

    # =========================================================================
    # STAGE 4: FINALISE (CRS & RENAME)
    # =========================================================================
    export_df = export_df.drop(columns=["geometry"])  # drop the old WKT column
    export_df = export_df.set_crs("EPSG:4326", allow_override=True)

    # Upper-case attribute columns (and strip ghost spaces: " ID " -> "ID"),
    # keeping 'geom' lowercase.
    export_df = export_df.rename(
        columns=lambda c: str(c).strip().upper() if c != "geom" else c
    )

    # Export to parquet for the ingestion stage.
    export_df.to_parquet(tmp_file)

    return len(export_df)
|
||||
|
||||
43
app/mapset_pipeline/utils/formatters.py
Executable file
43
app/mapset_pipeline/utils/formatters.py
Executable file
|
|
@ -0,0 +1,43 @@
|
|||
import os
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from shapely.geometry import base as shapely_base
|
||||
from shapely.geometry.base import BaseGeometry
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def safe_json(value):
    """
    Convert numpy/pandas/shapely values into JSON-serializable Python types.

    Returns int/float for numpy scalars, ISO strings for Timestamps, WKT
    strings for shapely geometries, None for scalar NA values, and the value
    itself otherwise (lists/dicts pass through unchanged).
    """
    if isinstance(value, (np.int64, np.int32)):
        return int(value)
    if isinstance(value, (np.float64, np.float32)):
        return float(value)
    if isinstance(value, pd.Timestamp):
        return value.isoformat()
    if isinstance(value, shapely_base.BaseGeometry):
        return str(value)  # WKT representation
    # BUGFIX: pd.isna() on list-like input returns an array, which raises
    # ValueError in a boolean context — only NA-check scalars.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    return value
|
||||
|
||||
|
||||
def str_to_date(raw_date: str):
    """Parse a 'YYYY-MM-DD' string into a datetime.date; None on empty/invalid input."""
    if not raw_date:
        return None
    try:
        return datetime.strptime(raw_date, "%Y-%m-%d").date()
    except Exception as e:
        print("[WARNING] Tidak bisa parse dateCreated:", e)
        return None
|
||||
|
||||
|
||||
def save_xml_to_sld(xml_string, filename):
    """
    Write an SLD style document to style_temp/<filename>.sld.

    Args:
        xml_string: the SLD/XML content to persist.
        filename: base name (typically the job id) for the .sld file.

    Returns:
        The path of the written file.
    """
    folder_path = 'style_temp'
    os.makedirs(folder_path, exist_ok=True)

    # BUGFIX: name the file after the caller-supplied identifier — the
    # parameter was previously ignored, so every call overwrote one file.
    file_path = os.path.join(folder_path, f"{filename}.sld")

    with open(file_path, "w", encoding="utf-8") as f:
        f.write(xml_string)

    return file_path
|
||||
|
||||
208
app/mapset_pipeline/utils/pdf_cleaner.py
Executable file
208
app/mapset_pipeline/utils/pdf_cleaner.py
Executable file
|
|
@ -0,0 +1,208 @@
|
|||
import re
|
||||
import itertools
|
||||
|
||||
# Lowercase keywords that mark a column as geographic/administrative:
# coordinate columns (lat/lon/geometry) plus Indonesian administrative
# levels (desa/kelurahan/kecamatan/kabupaten/kota/provinsi, ...).
geo_admin_keywords = [
    'lat', 'lon', 'long', 'latitude', 'longitude', 'koordinat', 'geometry', 'geometri',
    'desa', 'kelurahan', 'kel', 'kecamatan', 'kabupaten', 'kab', 'kota', 'provinsi',
    'lokasi', 'region', 'area', 'zone', 'boundary', 'batas'
]
|
||||
|
||||
def normalize_text(text):
    """Lowercase, replace non-alphanumerics (except '/') with spaces, collapse whitespace."""
    lowered = text.lower()
    cleaned = re.sub(r'[^a-z0-9/ ]+', ' ', lowered)
    return re.sub(r'\s+', ' ', cleaned).strip()
|
||||
|
||||
def generate_combined_patterns(keywords):
    """Build regex patterns matching 'a / b' and 'b / a' for every keyword pair."""
    patterns = []
    for first, second in itertools.combinations(keywords, 2):
        patterns.extend((rf'{first}\s*/\s*{second}', rf'{second}\s*/\s*{first}'))
    return patterns
|
||||
|
||||
# Built once at import time: 'a / b' pattern strings for every pair of keywords.
combined_patterns = generate_combined_patterns(geo_admin_keywords)
|
||||
|
||||
def contains_geo_admin_keywords(text):
    """
    True when *text* looks like a geographic/administrative column name.

    Matches either a slash-combined pair ("desa/kelurahan") or a single
    keyword bounded by start/end/space/slash/underscore/dash.
    """
    cleaned = normalize_text(text)
    if len(cleaned) < 3:
        return False

    if any(re.search(pattern, cleaned) for pattern in combined_patterns):
        return True

    return any(
        re.search(rf'(^|[\s/_-]){kw}([\s/_-]|$)', cleaned)
        for kw in geo_admin_keywords
    )
|
||||
|
||||
def filter_geo_admin_column(tables):
    """Keep only the tables that have at least one geo/admin column."""
    return [
        table for table in tables
        if any(contains_geo_admin_keywords(col) for col in table['columns'])
    ]
|
||||
|
||||
|
||||
# Labels (substring match, case-sensitive variants included) that identify a
# row-number / serial-number column header.
NUMBER_HEADER_KEYWORDS = [
    "no","no.","nomor","nomor urut","no urut","No","Nomor","No Urut","Index",
    "ID","Sr No","S/N","SN","Sl No"
]
|
||||
|
||||
def has_number_header(header):
    """True when the header already contains a row-number style label."""
    return any(keyword in header for keyword in NUMBER_HEADER_KEYWORDS)
|
||||
|
||||
def is_numbering_column(col_values):
    """Heuristic: more than 60% of the non-empty string cells are 1-3 digit numbers."""
    candidates = [v.strip() for v in col_values if isinstance(v, str) and v]
    if not candidates:
        return False
    hits = sum(1 for v in candidates if re.fullmatch(r"0*\d{1,3}", v))
    return hits / len(candidates) > 0.6
|
||||
|
||||
def is_numeric_value(v):
    """True for real numbers and for strings of 1-3 digits (optionally zero-padded)."""
    if isinstance(v, (int, float)):
        return True
    if isinstance(v, str):
        return bool(re.fullmatch(r"0*\d{1,3}", v.strip()))
    return False
|
||||
|
||||
def cleaning_column(headers, bodies):
    """
    Clean each extracted table body.

    Per table: drop a leading auto-numbering column that has no matching
    header, then drop rows whose cell count does not match the table's
    column count.

    Args:
        headers: per-table column-name collections (parallel to *bodies*).
        bodies:  per-table lists of rows (each row a list of cells).

    Returns:
        Cleaned bodies, one entry per input table.
    """
    cleaned_bodies = []

    for header, body in zip(headers, bodies):
        if not body:
            cleaned_bodies.append(body)
            continue

        header_has_number = has_number_header(header)
        first_col = [row[0] for row in body if row and len(row) > 0]
        first_col_is_numbering = is_numbering_column(first_col)

        # PDF extractors often emit a row-number column without a header:
        # strip the numeric first cell from each row in that case.
        if not header_has_number and first_col_is_numbering:
            new_body = []
            for row in body:
                if not row:
                    continue
                first_val = row[0]
                if is_numeric_value(first_val) and len(row) > 1:
                    new_body.append(row[1:])
                else:
                    new_body.append(row)
            body = new_body

        # BUGFIX: compare against this table's column count (len(header)),
        # not the number of tables (len(headers)).
        header_len = len(header)
        filtered_body = [row for row in body if len(row) == header_len]

        cleaned_bodies.append(filtered_body)

    return cleaned_bodies
|
||||
|
||||
def parse_page_selection(selectedPage: str, total_pages: int):
    """
    Expand a page spec like "1-3, 5 7" into a sorted list of valid page numbers.

    An empty spec selects every page; malformed tokens are ignored; pages
    outside 1..total_pages are dropped.
    """
    if not selectedPage:
        return list(range(1, total_pages + 1))

    pages = set()
    for token in re.split(r'[,\s]+', selectedPage.strip()):
        try:
            if '-' in token:
                start, end = map(int, token.split('-'))
                pages.update(range(start, end + 1))
            else:
                pages.add(int(token))
        except ValueError:
            continue  # skip malformed tokens

    return [p for p in sorted(pages) if 1 <= p <= total_pages]
|
||||
|
||||
def is_number(s):
    """True when *s* (after dropping ',' and '.' separators) is all digits."""
    if s is None:
        return False
    digits_only = str(s).strip().translate(str.maketrans('', '', ',.'))
    return digits_only.isdigit()
|
||||
|
||||
def row_ratio(row):
    """Fraction of non-empty cells in *row* that are numeric (0 for an empty row)."""
    cells = [c for c in row if c not in (None, '', ' ')]
    if not cells:
        return 0
    return sum(is_number(c) for c in cells) / len(cells)
|
||||
|
||||
def has_mixed_text_and_numbers(row):
    """True when a row contains both alphabetic text and numeric cells."""
    cells = [c for c in row if c not in (None, '', ' ')]
    contains_text = any(
        isinstance(c, str) and re.search(r'[A-Za-z]', str(c)) for c in cells
    )
    contains_number = any(is_number(c) for c in cells)
    return contains_text and contains_number
|
||||
|
||||
def is_short_text_row(row):
    """Detect short text-only rows (<=2 cells, joined length <20) — likely captions."""
    cells = [str(c).strip() for c in row if c not in (None, '', ' ')]
    if not cells:
        return False
    if any(is_number(c) for c in cells):
        return False
    return len(cells) <= 2 and len(" ".join(cells)) < 20
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def get_number_column_index(columns):
    """Index of the first column whose header looks like a row-number label, else None."""
    for idx, col in enumerate(columns):
        if has_number_header(col):
            return idx
    return None
|
||||
|
||||
def get_start_end_number(rows, idx):
    """
    Return (first, last) values of column *idx* as ints, or (None, None).

    Falls back to (None, None) when the table is empty, the index is out of
    range, or the cells are not numeric. (Narrowed from a bare ``except:``,
    which also swallowed KeyboardInterrupt/SystemExit.)
    """
    try:
        return int(rows[0][idx]), int(rows[-1][idx])
    except (IndexError, KeyError, TypeError, ValueError):
        return None, None
|
||||
|
||||
def normalize_number_column(table):
    """
    Repair the row-number column in place so it is strictly increasing.

    PDF-extracted tables often repeat or reset their numbering; whenever a
    value does not increase, it is replaced with previous + 1. Rows whose
    number cell cannot be parsed are left untouched.

    Args:
        table: dict with "columns" (header names) and "rows" (mutable rows).

    Returns:
        The same table dict, mutated.
    """
    columns = table["columns"]
    rows = table["rows"]

    num_idx = get_number_column_index(columns)
    if num_idx is None:
        return table  # no number column to normalize

    current = None

    for row in rows:
        # Narrowed from a bare ``except:`` — only parse/access failures skip.
        try:
            val = int(row[num_idx])
        except (IndexError, TypeError, ValueError):
            continue

        if current is None:
            current = val
        elif val <= current:
            # Numbering stalled or reset: continue the sequence.
            current += 1
        else:
            current = val

        row[num_idx] = str(current)

    return table
|
||||
0
app/models/__init__.py
Normal file → Executable file
0
app/models/__init__.py
Normal file → Executable file
0
app/models/base.py
Normal file → Executable file
0
app/models/base.py
Normal file → Executable file
0
app/models/category_model.py
Normal file → Executable file
0
app/models/category_model.py
Normal file → Executable file
0
app/models/classification_model.py
Normal file → Executable file
0
app/models/classification_model.py
Normal file → Executable file
0
app/models/credential_model.py
Normal file → Executable file
0
app/models/credential_model.py
Normal file → Executable file
0
app/models/feedback_model.py
Normal file → Executable file
0
app/models/feedback_model.py
Normal file → Executable file
0
app/models/file_model.py
Normal file → Executable file
0
app/models/file_model.py
Normal file → Executable file
0
app/models/map_access_model.py
Normal file → Executable file
0
app/models/map_access_model.py
Normal file → Executable file
0
app/models/map_projection_system_model.py
Normal file → Executable file
0
app/models/map_projection_system_model.py
Normal file → Executable file
0
app/models/map_source_model.py
Normal file → Executable file
0
app/models/map_source_model.py
Normal file → Executable file
0
app/models/mapset_history_model.py
Normal file → Executable file
0
app/models/mapset_history_model.py
Normal file → Executable file
0
app/models/mapset_model.py
Normal file → Executable file
0
app/models/mapset_model.py
Normal file → Executable file
0
app/models/news_model.py
Normal file → Executable file
0
app/models/news_model.py
Normal file → Executable file
0
app/models/organization_model.py
Normal file → Executable file
0
app/models/organization_model.py
Normal file → Executable file
0
app/models/refresh_token_model.py
Normal file → Executable file
0
app/models/refresh_token_model.py
Normal file → Executable file
0
app/models/regional_model.py
Normal file → Executable file
0
app/models/regional_model.py
Normal file → Executable file
0
app/models/role_model.py
Normal file → Executable file
0
app/models/role_model.py
Normal file → Executable file
0
app/models/user_model.py
Normal file → Executable file
0
app/models/user_model.py
Normal file → Executable file
0
app/repositories/__init__.py
Normal file → Executable file
0
app/repositories/__init__.py
Normal file → Executable file
0
app/repositories/base.py
Normal file → Executable file
0
app/repositories/base.py
Normal file → Executable file
0
app/repositories/category_repository.py
Normal file → Executable file
0
app/repositories/category_repository.py
Normal file → Executable file
0
app/repositories/classification_repository.py
Normal file → Executable file
0
app/repositories/classification_repository.py
Normal file → Executable file
0
app/repositories/credential_repository.py
Normal file → Executable file
0
app/repositories/credential_repository.py
Normal file → Executable file
0
app/repositories/feedback_repository.py
Normal file → Executable file
0
app/repositories/feedback_repository.py
Normal file → Executable file
0
app/repositories/file_repository.py
Normal file → Executable file
0
app/repositories/file_repository.py
Normal file → Executable file
0
app/repositories/map_access_repository.py
Normal file → Executable file
0
app/repositories/map_access_repository.py
Normal file → Executable file
0
app/repositories/map_projection_system_repository.py
Normal file → Executable file
0
app/repositories/map_projection_system_repository.py
Normal file → Executable file
0
app/repositories/map_source_repository.py
Normal file → Executable file
0
app/repositories/map_source_repository.py
Normal file → Executable file
0
app/repositories/map_source_usage_repository.py
Normal file → Executable file
0
app/repositories/map_source_usage_repository.py
Normal file → Executable file
0
app/repositories/mapset_history_repository.py
Normal file → Executable file
0
app/repositories/mapset_history_repository.py
Normal file → Executable file
0
app/repositories/mapset_repository.py
Normal file → Executable file
0
app/repositories/mapset_repository.py
Normal file → Executable file
0
app/repositories/news_repository.py
Normal file → Executable file
0
app/repositories/news_repository.py
Normal file → Executable file
0
app/repositories/organization_repository.py
Normal file → Executable file
0
app/repositories/organization_repository.py
Normal file → Executable file
0
app/repositories/regional_repository.py
Normal file → Executable file
0
app/repositories/regional_repository.py
Normal file → Executable file
0
app/repositories/role_repository.py
Normal file → Executable file
0
app/repositories/role_repository.py
Normal file → Executable file
0
app/repositories/token_repository.py
Normal file → Executable file
0
app/repositories/token_repository.py
Normal file → Executable file
0
app/repositories/user_repository.py
Normal file → Executable file
0
app/repositories/user_repository.py
Normal file → Executable file
0
app/response/res.py
Normal file → Executable file
0
app/response/res.py
Normal file → Executable file
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user