260 lines
8.5 KiB
Python
Executable File
260 lines
8.5 KiB
Python
Executable File
import os
|
|
import json
|
|
import asyncio
|
|
import pandas as pd
|
|
from shapely import wkt, wkb
|
|
from shapely.geometry import MultiPolygon, MultiLineString
|
|
from sqlalchemy import text
|
|
from sqlalchemy.exc import SQLAlchemyError
|
|
|
|
# Import koneksi database Anda
|
|
from database.connection import engine
|
|
from app.mapset_pipeline.utils.formatters import str_to_date
|
|
|
|
|
|
async def generate_unique_table_name(base_name: str) -> str:
    """Return a table name derived from *base_name* that does not exist yet.

    The base is normalized (lowercase, spaces/dashes -> underscores). If a
    relation with that name already exists, numeric suffixes ``_2``, ``_3``,
    ... are tried until a free name is found.
    """
    normalized = base_name.lower().replace(" ", "_").replace("-", "_")

    async with engine.connect() as conn:
        candidate = normalized
        suffix = 2
        while True:
            # to_regclass() resolves the name against the default search
            # path (public) and yields NULL when no relation matches.
            lookup = await conn.execute(
                text("SELECT to_regclass(:tname)"),
                {"tname": candidate},
            )
            if not lookup.scalar():
                return candidate
            candidate = f"{normalized}_{suffix}"
            suffix += 1
|
|
|
|
|
|
def _normalize_geometry(raw_geom):
    """Parse a WKT string or WKB bytes into a shapely geometry.

    Invalid shapes are repaired with buffer(0) and POLYGON/LINESTRING are
    promoted to their Multi* counterparts so the whole table ends up with a
    uniform geometry type. Returns None for unparseable values, unsupported
    types (e.g. NaN floats coming from pandas) or empty geometries.
    """
    if isinstance(raw_geom, str):
        geom = wkt.loads(raw_geom)
    elif isinstance(raw_geom, (bytes, bytearray)):
        geom = wkb.loads(bytes(raw_geom))
    else:
        return None

    if geom is None or geom.is_empty:
        return None

    # buffer(0) is the classic repair trick for self-intersections and
    # similar validity problems (mainly meaningful for polygons).
    if not geom.is_valid:
        geom = geom.buffer(0)

    gtype = geom.geom_type.upper()
    if gtype == "POLYGON":
        geom = MultiPolygon([geom])
    elif gtype == "LINESTRING":
        geom = MultiLineString([geom])
    return geom


async def insert_parquet_to_postgis(filename: str, table_name: str):
    """
    Read a temporary parquet file, clean the data, and bulk-COPY it into
    PostGIS using the asyncpg pool for high throughput.

    Args:
        filename: File name inside the local ``tmp`` directory.
        table_name: Target table to create and populate (assumed to come
            from generate_unique_table_name(), i.e. already sanitized).

    Returns:
        dict with ``table_name``, ``row_count`` and ``geom_type``.

    Raises:
        FileNotFoundError: temp file missing.
        ValueError: no GEOM column, or no valid rows after cleaning.
    """
    # Imported lazily to avoid a circular import at module load time.
    from main import db_pool

    file_path = os.path.join("tmp", filename)

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File temp {file_path} tidak ditemukan")

    try:
        loop = asyncio.get_running_loop()
        # Parquet parsing is CPU bound; run in executor to keep the event
        # loop responsive on large files.
        df = await loop.run_in_executor(None, pd.read_parquet, file_path)

        # 1. CLEAN COLUMN NAMES (upper-case so GEOM detection is reliable).
        df.columns = [str(col).strip().upper() for col in df.columns]

        if "GEOM" not in df.columns:
            raise ValueError("Kolom GEOM tidak ditemukan dalam Parquet")

        # 2. PREPARE DATA ROWS.
        # Positional indices are used instead of getattr() on namedtuples:
        # itertuples() renames columns that are not valid Python identifiers
        # (to _1, _2, ...), which would silently turn those values into None.
        columns = list(df.columns)
        geom_idx = columns.index("GEOM")
        attr_columns = [col for col in columns if col != "GEOM"]
        attr_idxs = [i for i, col in enumerate(columns) if col != "GEOM"]

        clean_rows = []
        geom_types = set()

        for values in df.itertuples(index=False, name=None):
            raw_geom = values[geom_idx]
            # Skip rows with no geometry payload (None, "", b"", ...).
            if not raw_geom:
                continue

            try:
                geom = _normalize_geometry(raw_geom)
                if not geom:
                    continue
                geom_types.add(geom.geom_type)
                # EWKT carries the SRID alongside the WKT payload.
                ewkt = f"SRID=4326;{geom.wkt}"
            except Exception:
                continue  # Skip rows with broken geometries.

            # Attributes are forced to TEXT for safety; None and NaN become
            # SQL NULL instead of the literal string "nan".
            row_data = []
            for i in attr_idxs:
                val = values[i]
                if val is None or (isinstance(val, float) and val != val):
                    row_data.append(None)
                else:
                    row_data.append(str(val))

            row_data.append(ewkt)
            clean_rows.append(tuple(row_data))

        if not clean_rows:
            raise ValueError("Data valid kosong setelah pemrosesan geometry")

        # 3. DATABASE OPERATIONS.
        # geom_types is guaranteed non-empty here (clean_rows is non-empty).
        final_geom_type = next(iter(geom_types)).upper()
        if not final_geom_type.startswith("MULTI"):
            final_geom_type = "MULTI" + final_geom_type

        # A. CREATE TABLE — every attribute starts as TEXT to avoid type
        # inference failures on dirty data.
        # NOTE(review): table_name is interpolated into SQL; callers must
        # only pass names produced by generate_unique_table_name().
        col_defs = [f'"{col}" TEXT' for col in attr_columns]

        create_sql = f"""
            CREATE TABLE {table_name} (
                _id SERIAL PRIMARY KEY,
                {', '.join(col_defs)},
                geom TEXT
            );
        """

        async with db_pool.acquire() as conn:
            await conn.execute(create_sql)

            # B. COPY data (bulk insert); asyncpg quotes column names itself.
            target_cols = attr_columns + ['geom']
            await conn.copy_records_to_table(
                table_name,
                records=clean_rows,
                columns=target_cols
            )

            # C. Convert the staging TEXT column into a real geometry column
            # (forced to 2D, SRID 4326) and add a spatial index.
            alter_sql = f"""
                ALTER TABLE {table_name}
                ALTER COLUMN geom TYPE geometry({final_geom_type}, 4326)
                USING ST_Force2D(geom::geometry)::geometry({final_geom_type}, 4326);

                CREATE INDEX idx_{table_name}_geom ON {table_name} USING GIST (geom);
            """
            await conn.execute(alter_sql)

        print(f"[SUCCESS] Upload {len(clean_rows)} baris ke tabel {table_name}.")

        # Remove the temp file only after a successful load; best-effort.
        try:
            os.remove(file_path)
        except OSError:
            pass

        return {
            "table_name": table_name,
            "row_count": len(clean_rows),
            "geom_type": final_geom_type
        }

    except Exception as e:
        print(f"[ERROR] Processing parquet to DB: {e}")
        raise
|
|
|
|
|
|
async def save_author_metadata(payload_author: dict, table_name: str, dataset_title: str,
                               geom_types: list, row_count: int, user_id: int):
    """
    Persist author metadata and dataset information into the
    backend.author_metadata table.

    Args:
        payload_author: Raw author payload (abstract, keywords, contacts, ...).
        table_name: Physical table the dataset was loaded into.
        dataset_title: Human-readable dataset title.
        geom_types: Geometry types found in the dataset (stored as JSON).
        row_count: Number of geometry rows loaded.
        user_id: Id of the uploading user.
    """
    query = text("""
        INSERT INTO backend.author_metadata (
            table_title,
            dataset_title,
            dataset_abstract,
            keywords,
            topic_category,
            date_created,
            dataset_status,
            organization_name,
            contact_person_name,
            contact_email,
            contact_phone,
            geom_type,
            user_id,
            process,
            geometry_count
        ) VALUES (
            :table_title,
            :dataset_title,
            :dataset_abstract,
            :keywords,
            :topic_category,
            :date_created,
            :dataset_status,
            :organization_name,
            :contact_person_name,
            :contact_email,
            :contact_phone,
            :geom_type,
            :user_id,
            :process,
            :geometry_count
        )
    """)

    params = {
        "table_title": table_name,
        "dataset_title": dataset_title,
        "dataset_abstract": payload_author.get("abstract"),
        "keywords": payload_author.get("keywords"),
        # `or []` guards against an explicit null in the payload — a stored
        # None is returned by dict.get() despite the default and would make
        # str.join() raise TypeError.
        "topic_category": ", ".join(payload_author.get("topicCategory") or []),
        "date_created": str_to_date(payload_author.get("dateCreated")),
        "dataset_status": payload_author.get("status"),
        "organization_name": payload_author.get("organization"),
        "contact_person_name": payload_author.get("contactName"),
        "contact_email": payload_author.get("contactEmail"),
        "contact_phone": payload_author.get("contactPhone"),
        "geom_type": json.dumps(geom_types),
        "user_id": user_id,
        "process": 'CLEANSING',
        "geometry_count": row_count
    }

    # engine.begin() commits on success and rolls back on error.
    async with engine.begin() as conn:
        await conn.execute(query, params)
|
|
|
|
|
|
async def call_cleansing_procedure(table_name: str):
    """Run the geometry-cleansing stored procedure for *table_name*.

    Returns "done" on success; raises RuntimeError when the database call
    fails so the service layer knows the step did not complete.
    """
    try:
        print(f"[INFO] Memulai cleansing database untuk tabel: {table_name}")

        # Safe parameter binding; engine.begin() commits the CALL.
        stmt = text("CALL pr_cleansing_satupeta_polygon(:table_name, NULL);")
        async with engine.begin() as conn:
            await conn.execute(stmt, {"table_name": table_name})

        print(f"[SUCCESS] Cleansing selesai untuk tabel: {table_name}")
        return "done"

    except SQLAlchemyError as e:
        print(f"[ERROR] Cleansing database gagal: {e}")
        # Re-raise as a generic runtime error for the caller to handle.
        raise RuntimeError(f"Database cleansing failed: {str(e)}")
|
|
|
|
|