satupeta-main/app/mapset_pipeline/data/repository.py

260 lines
8.5 KiB
Python
Raw Normal View History

2026-02-23 05:20:42 +00:00
import os
import json
import asyncio
import pandas as pd
from shapely import wkt, wkb
from shapely.geometry import MultiPolygon, MultiLineString
from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError
# Import koneksi database Anda
from database.connection import engine
from app.mapset_pipeline.utils.formatters import str_to_date
async def generate_unique_table_name(base_name: str) -> str:
    """Return a table name derived from *base_name* that is not yet in use.

    The name is lower-cased and every character that is not a word character
    or ``$`` (spaces, hyphens, dots, quotes, parentheses, ...) is replaced by
    an underscore, so the result is always a plain unquoted PostgreSQL
    identifier. If the name is already taken, ``_2``, ``_3``, ... is appended
    until ``to_regclass`` reports no existing relation.

    Args:
        base_name: Human-readable name to derive the table name from.

    Returns:
        A normalized table name, at most 63 bytes long, for which
        ``to_regclass`` returned NULL at the time of the check.

    Raises:
        ValueError: If *base_name* is empty.

    Note:
        The check and the later CREATE TABLE are not atomic; two concurrent
        callers can still receive the same name.
    """
    import re  # local import so this block does not depend on file-level imports

    # PostgreSQL silently truncates identifiers to 63 bytes. Without an
    # explicit limit the "_N" suffix of a long name is cut off, every
    # candidate resolves to the same existing table and the loop never ends.
    max_bytes = 63

    def _fit(name: str, limit: int) -> str:
        """Truncate *name* to *limit* UTF-8 bytes without splitting a character."""
        return name.encode("utf-8")[:limit].decode("utf-8", errors="ignore")

    # One underscore per replaced character keeps the result identical to the
    # previous behaviour for names that only contained spaces and hyphens.
    normalized = re.sub(r"[^\w$]", "_", base_name.lower())
    if not normalized:
        raise ValueError("Nama tabel tidak boleh kosong")

    # An unquoted identifier must start with a letter or an underscore.
    if not (normalized[0].isalpha() or normalized[0] == "_"):
        normalized = f"_{normalized}"
    normalized = _fit(normalized, max_bytes)

    candidate = normalized
    counter = 2
    async with engine.connect() as conn:
        while True:
            # to_regclass returns NULL when no relation with that name is
            # visible on the current search path.
            result = await conn.execute(
                text("SELECT to_regclass(:tname)"),
                {"tname": candidate},
            )
            if not result.scalar():
                return candidate

            suffix = f"_{counter}"
            # Reserve room for the suffix so it survives truncation.
            candidate = _fit(normalized, max_bytes - len(suffix)) + suffix
            counter += 1
# Single-part geometry types and the multi-part PostGIS type they are stored as.
_SINGLE_TO_MULTI = {
    "POINT": "MULTIPOINT",
    "LINESTRING": "MULTILINESTRING",
    "POLYGON": "MULTIPOLYGON",
}


def _quote_ident(name) -> str:
    """Quote *name* as a PostgreSQL identifier, doubling embedded double quotes."""
    return '"' + str(name).replace('"', '""') + '"'


def _is_missing(value) -> bool:
    """Return True for None and for scalar NaN / NaT / NA values."""
    if value is None:
        return True
    # is_scalar guards against list/array cells, for which isna() is element-wise.
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def _resolve_geom_type(geom_types) -> str:
    """Pick the PostGIS column type for the set of shapely geometry type names.

    A single homogeneous family maps to its MULTI* type; mixed or unrecognised
    types fall back to the generic ``GEOMETRY`` type.
    """
    promoted = {_SINGLE_TO_MULTI.get(name.upper(), name.upper()) for name in geom_types}
    if len(promoted) == 1:
        (only,) = promoted
        if only in _SINGLE_TO_MULTI.values():
            return only
    return "GEOMETRY"


def _load_clean_rows(file_path: str):
    """Read the parquet file and build the records to COPY.

    Returns a tuple ``(attr_columns, clean_rows, geom_types)`` where every
    record holds the attribute values as strings (or None) followed by the
    geometry as EWKT with SRID 4326.

    Raises:
        ValueError: If the file has no GEOM column.
    """
    df = pd.read_parquet(file_path)

    # 1. Normalise column names.
    columns = [str(col).strip().upper() for col in df.columns]
    df.columns = columns

    # NOTE(review): the original code renamed "GEOM" to "GEOM" here, which is a
    # no-op. If other spellings (e.g. "GEOMETRY") must be accepted, map them
    # to "GEOM" at this point.
    if "GEOM" not in columns:
        raise ValueError("Kolom GEOM tidak ditemukan dalam Parquet")

    geom_idx = columns.index("GEOM")
    attr_idx = [i for i, col in enumerate(columns) if col != "GEOM"]
    attr_columns = [columns[i] for i in attr_idx]

    # 2. Build the rows.
    clean_rows = []
    geom_types = set()
    skipped = 0

    # name=None yields plain tuples. The default namedtuple renames columns
    # that are not valid Python identifiers ("NAMA DESA", "2020", ...), which
    # made getattr(row, col, None) silently return None for those columns.
    for record in df.itertuples(index=False, name=None):
        raw_geom = record[geom_idx]
        try:
            if isinstance(raw_geom, str) and raw_geom:
                geom = wkt.loads(raw_geom)
            elif isinstance(raw_geom, (bytes, bytearray)) and raw_geom:
                geom = wkb.loads(bytes(raw_geom))
            else:
                geom = None

            if geom is None or geom.is_empty:
                skipped += 1
                continue

            # Repair invalid geometry; the repair itself can yield an empty shape.
            if not geom.is_valid:
                geom = geom.buffer(0)
                if geom.is_empty:
                    skipped += 1
                    continue

            # Promote to multi-part so the column type is uniform.
            kind = geom.geom_type.upper()
            if kind == "POLYGON":
                geom = MultiPolygon([geom])
            elif kind == "LINESTRING":
                geom = MultiLineString([geom])

            ewkt = f"SRID=4326;{geom.wkt}"
        except Exception:
            # Deliberate best-effort: a broken geometry skips the row only.
            skipped += 1
            continue

        geom_types.add(geom.geom_type)

        # Attributes are stored as TEXT; missing values become SQL NULL
        # instead of the strings "nan" / "NaT".
        values = [
            None if _is_missing(record[i]) else str(record[i])
            for i in attr_idx
        ]
        values.append(ewkt)
        clean_rows.append(tuple(values))

    if skipped:
        print(f"[WARN] {skipped} baris dilewati karena geometri kosong atau rusak.")

    return attr_columns, clean_rows, geom_types


async def insert_parquet_to_postgis(filename: str, table_name: str):
    """Load ``tmp/<filename>`` (parquet) into a new PostGIS table.

    The file is read and cleaned in a worker thread, then the table is
    created, filled with COPY, converted to a typed geometry column and
    indexed inside a single transaction, so a failure leaves no half-built
    table behind. The temp file is removed after a successful load.

    Args:
        filename: Name of the parquet file inside the ``tmp`` directory.
        table_name: Name of the table to create.

    Returns:
        dict with ``table_name``, ``row_count`` and ``geom_type``.

    Raises:
        FileNotFoundError: If the temp file does not exist.
        ValueError: If the GEOM column is missing or no row has a usable
            geometry.
        Exception: Database errors are logged and re-raised unchanged.
    """
    # Imported lazily, as in the original code (the pool lives in main).
    from main import db_pool

    file_path = os.path.join("tmp", filename)
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File temp {file_path} tidak ditemukan")

    try:
        loop = asyncio.get_running_loop()
        # Reading and per-row geometry cleaning are CPU bound; keep them off
        # the event loop.
        attr_columns, clean_rows, geom_types = await loop.run_in_executor(
            None, _load_clean_rows, file_path
        )

        if not clean_rows:
            raise ValueError("Data valid kosong setelah pemrosesan geometry")

        # 3. Database operations.
        final_geom_type = _resolve_geom_type(geom_types)
        quoted_table = _quote_ident(table_name)

        # Building the definitions as a list keeps the statement valid even
        # when the file has no attribute columns. Attributes start as TEXT.
        column_defs = ["_id SERIAL PRIMARY KEY"]
        column_defs.extend(f"{_quote_ident(col)} TEXT" for col in attr_columns)
        column_defs.append("geom TEXT")
        create_sql = f"CREATE TABLE {quoted_table} ({', '.join(column_defs)});"

        # ST_Multi promotes single-part rows (e.g. POINT) that were not
        # promoted in Python, so they match the MULTI* column type.
        alter_sql = f"""
            ALTER TABLE {quoted_table}
            ALTER COLUMN geom TYPE geometry({final_geom_type}, 4326)
            USING ST_Multi(ST_Force2D(geom::geometry))::geometry({final_geom_type}, 4326);
        """
        index_name = _quote_ident(f"idx_{table_name}_geom")
        index_sql = f"CREATE INDEX {index_name} ON {quoted_table} USING GIST (geom);"

        async with db_pool.acquire() as conn:
            async with conn.transaction():
                # A. Create table.
                await conn.execute(create_sql)

                # B. Bulk insert. asyncpg quotes the table and column names
                # itself, which is why the DDL above quotes them as well.
                await conn.copy_records_to_table(
                    table_name,
                    records=clean_rows,
                    columns=attr_columns + ["geom"],
                )

                # C. Typed geometry column and spatial index.
                await conn.execute(alter_sql)
                await conn.execute(index_sql)

        print(f"[SUCCESS] Upload {len(clean_rows)} baris ke tabel {table_name}.")

        # Remove the temp file after success; failure to do so is not fatal.
        try:
            os.remove(file_path)
        except OSError:
            pass

        return {
            "table_name": table_name,
            "row_count": len(clean_rows),
            "geom_type": final_geom_type,
        }
    except Exception as e:
        print(f"[ERROR] Processing parquet to DB: {e}")
        # Bare raise keeps the original traceback intact.
        raise
async def save_author_metadata(payload_author: dict, table_name: str, dataset_title: str,
                               geom_types: list, row_count: int, user_id: int):
    """Insert one row describing the dataset into ``backend.author_metadata``.

    Args:
        payload_author: Author form payload. Keys read here: ``abstract``,
            ``keywords``, ``topicCategory``, ``dateCreated``, ``status``,
            ``organization``, ``contactName``, ``contactEmail``,
            ``contactPhone``. Missing keys are stored as NULL.
        table_name: Name of the data table, stored as ``table_title``.
        dataset_title: Title of the dataset.
        geom_types: Geometry type names; stored JSON-encoded in ``geom_type``.
        row_count: Number of geometries, stored as ``geometry_count``.
        user_id: Id of the uploading user.

    The ``process`` column is always set to ``'CLEANSING'``. The insert runs
    in its own transaction (``engine.begin()``).
    """
    query = text("""
        INSERT INTO backend.author_metadata (
            table_title,
            dataset_title,
            dataset_abstract,
            keywords,
            topic_category,
            date_created,
            dataset_status,
            organization_name,
            contact_person_name,
            contact_email,
            contact_phone,
            geom_type,
            user_id,
            process,
            geometry_count
        ) VALUES (
            :table_title,
            :dataset_title,
            :dataset_abstract,
            :keywords,
            :topic_category,
            :date_created,
            :dataset_status,
            :organization_name,
            :contact_person_name,
            :contact_email,
            :contact_phone,
            :geom_type,
            :user_id,
            :process,
            :geometry_count
        )
    """)

    # "topicCategory" may be absent, explicitly null, a single string or a
    # list. Joining None raises TypeError and joining a plain string would
    # split it into characters, so normalise to a list of strings first.
    topics = payload_author.get("topicCategory")
    if topics is None:
        topics = []
    elif isinstance(topics, str):
        topics = [topics]
    topic_category = ", ".join(str(topic) for topic in topics)

    params = {
        "table_title": table_name,
        "dataset_title": dataset_title,
        "dataset_abstract": payload_author.get("abstract"),
        "keywords": payload_author.get("keywords"),
        "topic_category": topic_category,
        "date_created": str_to_date(payload_author.get("dateCreated")),
        "dataset_status": payload_author.get("status"),
        "organization_name": payload_author.get("organization"),
        "contact_person_name": payload_author.get("contactName"),
        "contact_email": payload_author.get("contactEmail"),
        "contact_phone": payload_author.get("contactPhone"),
        "geom_type": json.dumps(geom_types),
        "user_id": user_id,
        "process": 'CLEANSING',
        "geometry_count": row_count,
    }

    async with engine.begin() as conn:
        await conn.execute(query, params)
async def call_cleansing_procedure(table_name: str):
    """Run the ``pr_cleansing_satupeta_polygon`` stored procedure for a table.

    Args:
        table_name: Name of the table, passed as a bound parameter. The
            procedure's second argument is always NULL.

    Returns:
        The string ``"done"`` when the call succeeds.

    Raises:
        RuntimeError: If SQLAlchemy reports an error; the original exception
            is attached as ``__cause__``.
    """
    try:
        print(f"[INFO] Memulai cleansing database untuk tabel: {table_name}")
        async with engine.begin() as conn:
            # Bound parameter: the table name is never spliced into the SQL text.
            await conn.execute(
                text("CALL pr_cleansing_satupeta_polygon(:table_name, NULL);"),
                {"table_name": table_name},
            )
        print(f"[SUCCESS] Cleansing selesai untuk tabel: {table_name}")
        return "done"
    except SQLAlchemyError as e:
        print(f"[ERROR] Cleansing database gagal: {e}")
        # Re-raise so the service layer sees the failure; "from e" keeps the
        # database error as the explicit cause.
        raise RuntimeError(f"Database cleansing failed: {str(e)}") from e