satupeta-main/app/mapset_pipeline/data/repository.py
2026-02-23 12:20:42 +07:00

260 lines
8.5 KiB
Python
Executable File

import os
import json
import asyncio
import pandas as pd
from shapely import wkt, wkb
from shapely.geometry import MultiPolygon, MultiLineString
from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError
# Import your database connection
from database.connection import engine
from app.mapset_pipeline.utils.formatters import str_to_date
async def generate_unique_table_name(base_name: str) -> str:
    """Return a table name derived from ``base_name`` that does not exist yet.

    The base name is lower-cased and spaces/dashes become underscores. If the
    resulting name already resolves to a relation, numeric suffixes starting
    at ``_2`` are tried until a free name is found.
    """
    normalized = base_name.lower().replace(" ", "_").replace("-", "_")
    # to_regclass() yields NULL when the name does not resolve to a relation
    # on the current search path.
    probe = text("SELECT to_regclass(:tname)")

    candidate = normalized
    suffix = 2
    async with engine.connect() as conn:
        while (await conn.execute(probe, {"tname": candidate})).scalar():
            candidate = f"{normalized}_{suffix}"
            suffix += 1
    return candidate
# Single-part geometry type names and the multi-part type each is promoted to.
_MULTI_GEOM_TYPES = {
    "POINT": "MULTIPOINT",
    "LINESTRING": "MULTILINESTRING",
    "POLYGON": "MULTIPOLYGON",
}


def _quote_ident(identifier) -> str:
    """Return ``identifier`` as a double-quoted SQL identifier, doubling embedded quotes."""
    return '"{}"'.format(str(identifier).replace('"', '""'))


def _load_geometry(raw_geom):
    """Parse one GEOM cell (WKT ``str`` or WKB ``bytes``) into a shapely geometry.

    Invalid geometries are passed through ``buffer(0)``; single polygons and
    linestrings are wrapped into their multi-part type. Returns ``None`` for
    empty or unsupported cell values and for geometries that end up empty.
    Parsing errors propagate to the caller.
    """
    if not isinstance(raw_geom, (str, bytes)) or not raw_geom:
        return None

    geom = wkt.loads(raw_geom) if isinstance(raw_geom, str) else wkb.loads(raw_geom)
    if not geom.is_valid:
        # buffer(0) is the repair step used for invalid geometries.
        geom = geom.buffer(0)
    if geom.is_empty:
        return None

    # Force multi-part geometries so the table holds a uniform type.
    kind = geom.geom_type
    if kind == "Polygon":
        geom = MultiPolygon([geom])
    elif kind == "LineString":
        geom = MultiLineString([geom])
    return geom


def _build_copy_rows(df, attr_positions, geom_position):
    """Turn DataFrame rows into COPY-ready tuples: attributes as text, then EWKT.

    Returns ``(rows, geom_types)`` where ``geom_types`` is the set of
    upper-cased shapely geometry type names that were kept.
    """
    is_scalar = pd.api.types.is_scalar
    rows = []
    geom_types = set()

    # name=None yields plain tuples that are read by position. Namedtuple rows
    # rename columns that are not valid Python identifiers (spaces, leading
    # digit/underscore), which would make a by-name lookup silently miss them.
    for record in df.itertuples(index=False, name=None):
        try:
            geom = _load_geometry(record[geom_position])
            if geom is None:
                continue
            # EWKT with a fixed SRID of 4326.
            ewkt = f"SRID=4326;{geom.wkt}"
        except Exception:
            # Best effort: a row whose geometry cannot be parsed is skipped;
            # the caller reports how many rows were dropped.
            continue

        geom_types.add(geom.geom_type.upper())

        values = []
        for position in attr_positions:
            value = record[position]
            # Missing scalars (None, NaN, NaT, pd.NA) become SQL NULL rather
            # than the literal text "nan"; everything else is stored as text.
            is_missing = is_scalar(value) and pd.isna(value)
            values.append(None if is_missing else str(value))
        values.append(ewkt)
        rows.append(tuple(values))

    return rows, geom_types


async def insert_parquet_to_postgis(filename: str, table_name: str):
    """Load a temporary parquet file into a new PostGIS table.

    The file ``tmp/<filename>`` is read with pandas, column names are trimmed
    and upper-cased, every attribute is stored as TEXT, and geometries are
    written as EWKT (SRID 4326) before the ``geom`` column is converted to a
    typed 2D geometry column with a GiST index. Table creation, the bulk COPY
    and the column conversion run in one transaction, so a failure leaves no
    partially built table behind. The temp file is removed on success.

    Args:
        filename: Name of the parquet file inside the ``tmp`` directory.
        table_name: Name of the table to create.

    Returns:
        dict with ``table_name``, ``row_count`` (rows inserted) and
        ``geom_type`` (the geometry type used for the ``geom`` column).

    Raises:
        FileNotFoundError: If the temp file does not exist.
        ValueError: If there is no geometry column or no row has a usable
            geometry.
        Exception: Any database or parsing error is logged and re-raised.
    """
    # Imported at call time, presumably so the pool created at application
    # startup is picked up and a circular import is avoided (TODO confirm).
    from main import db_pool

    file_path = os.path.join("tmp", filename)
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File temp {file_path} tidak ditemukan")

    try:
        loop = asyncio.get_running_loop()
        # Reading parquet is blocking, so it runs in the default executor.
        df = await loop.run_in_executor(None, pd.read_parquet, file_path)

        # 1. Normalize column names.
        df.columns = [str(col).strip().upper() for col in df.columns]
        # NOTE(review): the previous rename mapped GEOM to itself (a no-op);
        # GEOMETRY is assumed to be the alias that was meant. It is only used
        # when no GEOM column exists, so existing inputs behave as before.
        if "GEOM" not in df.columns and "GEOMETRY" in df.columns:
            df.rename(columns={"GEOMETRY": "GEOM"}, inplace=True)
        if "GEOM" not in df.columns:
            raise ValueError("Kolom GEOM tidak ditemukan dalam Parquet")

        # 2. Prepare the rows.
        columns = list(df.columns)
        geom_position = columns.index("GEOM")
        attr_positions = [i for i, col in enumerate(columns) if col != "GEOM"]
        attr_columns = [columns[i] for i in attr_positions]

        clean_rows, geom_types = _build_copy_rows(df, attr_positions, geom_position)
        if not clean_rows:
            raise ValueError("Data valid kosong setelah pemrosesan geometry")

        skipped = len(df) - len(clean_rows)
        if skipped:
            print(f"[WARN] {skipped} baris dilewati karena geometry kosong atau rusak.")

        # 3. Database operations.
        # Promote single-part names to their multi-part type; when the data
        # mixes several types, fall back to the generic GEOMETRY typmod so
        # the column conversion does not reject part of the rows.
        promoted = {_MULTI_GEOM_TYPES.get(name, name) for name in geom_types}
        final_geom_type = promoted.pop() if len(promoted) == 1 else "GEOMETRY"

        # Identifiers are quoted the same way asyncpg quotes them for COPY, so
        # CREATE/ALTER/COPY all address the same relation and column names
        # coming from the uploaded file cannot break out of the statement.
        quoted_table = _quote_ident(table_name)
        quoted_index = _quote_ident(f"idx_{table_name}_geom")

        # Every attribute is TEXT; building the list first avoids a dangling
        # comma when the file has no attribute columns.
        column_defs = ["_id SERIAL PRIMARY KEY"]
        column_defs.extend(f"{_quote_ident(col)} TEXT" for col in attr_columns)
        column_defs.append("geom TEXT")
        create_sql = "CREATE TABLE {} (\n    {}\n);".format(
            quoted_table, ",\n    ".join(column_defs)
        )

        # ST_Multi makes single-part rows (e.g. points, which are not wrapped
        # on the Python side) match the multi-part column type.
        alter_sql = f"""
        ALTER TABLE {quoted_table}
            ALTER COLUMN geom TYPE geometry({final_geom_type}, 4326)
            USING ST_Multi(ST_Force2D(geom::geometry))::geometry({final_geom_type}, 4326);
        CREATE INDEX {quoted_index} ON {quoted_table} USING GIST (geom);
        """

        target_cols = attr_columns + ["geom"]
        async with db_pool.acquire() as conn:
            async with conn.transaction():
                # A. Create the table.
                await conn.execute(create_sql)
                # B. Bulk insert via COPY (asyncpg quotes the identifiers).
                await conn.copy_records_to_table(
                    table_name,
                    records=clean_rows,
                    columns=target_cols,
                )
                # C. Convert the geom column and build the spatial index.
                await conn.execute(alter_sql)

        print(f"[SUCCESS] Upload {len(clean_rows)} baris ke tabel {table_name}.")

        # Remove the temp file after success; failing to delete is not fatal.
        try:
            os.remove(file_path)
        except OSError:
            pass

        return {
            "table_name": table_name,
            "row_count": len(clean_rows),
            "geom_type": final_geom_type,
        }
    except Exception as e:
        print(f"[ERROR] Processing parquet to DB: {e}")
        raise
async def save_author_metadata(payload_author: dict, table_name: str, dataset_title: str,
                               geom_types: list, row_count: int, user_id: int):
    """Insert the author and dataset metadata row into ``backend.author_metadata``.

    Args:
        payload_author: Author form payload; the keys read are ``abstract``,
            ``keywords``, ``topicCategory``, ``dateCreated``, ``status``,
            ``organization``, ``contactName``, ``contactEmail`` and
            ``contactPhone``. Missing keys are stored as NULL.
        table_name: Stored in the ``table_title`` column.
        dataset_title: Title of the dataset.
        geom_types: Geometry type names; stored as a JSON string.
        row_count: Stored in the ``geometry_count`` column.
        user_id: Id of the uploading user.
    """
    # A payload may carry "topicCategory": null, or a single string instead of
    # a list. Joining None raises TypeError and joining a string would split
    # it into characters, so both are normalized first.
    topic_category = payload_author.get("topicCategory") or []
    if not isinstance(topic_category, str):
        topic_category = ", ".join(topic_category)

    query = text("""
        INSERT INTO backend.author_metadata (
            table_title,
            dataset_title,
            dataset_abstract,
            keywords,
            topic_category,
            date_created,
            dataset_status,
            organization_name,
            contact_person_name,
            contact_email,
            contact_phone,
            geom_type,
            user_id,
            process,
            geometry_count
        ) VALUES (
            :table_title,
            :dataset_title,
            :dataset_abstract,
            :keywords,
            :topic_category,
            :date_created,
            :dataset_status,
            :organization_name,
            :contact_person_name,
            :contact_email,
            :contact_phone,
            :geom_type,
            :user_id,
            :process,
            :geometry_count
        )
    """)

    params = {
        "table_title": table_name,
        "dataset_title": dataset_title,
        "dataset_abstract": payload_author.get("abstract"),
        "keywords": payload_author.get("keywords"),
        "topic_category": topic_category,
        "date_created": str_to_date(payload_author.get("dateCreated")),
        "dataset_status": payload_author.get("status"),
        "organization_name": payload_author.get("organization"),
        "contact_person_name": payload_author.get("contactName"),
        "contact_email": payload_author.get("contactEmail"),
        "contact_phone": payload_author.get("contactPhone"),
        "geom_type": json.dumps(geom_types),
        "user_id": user_id,
        # Every newly saved dataset starts in the CLEANSING process state.
        "process": 'CLEANSING',
        "geometry_count": row_count,
    }

    # engine.begin() commits on success and rolls back if the insert raises.
    async with engine.begin() as conn:
        await conn.execute(query, params)
async def call_cleansing_procedure(table_name: str):
    """Run the geometry cleansing stored procedure for ``table_name``.

    Calls ``pr_cleansing_satupeta_polygon(table_name, NULL)`` inside a
    transaction.

    Returns:
        The string ``"done"`` when the procedure completes.

    Raises:
        RuntimeError: If the database call fails; the original
            ``SQLAlchemyError`` is attached as the cause.
    """
    try:
        print(f"[INFO] Memulai cleansing database untuk tabel: {table_name}")

        async with engine.begin() as conn:
            # The table name is passed as a bound parameter, not interpolated.
            await conn.execute(
                text("CALL pr_cleansing_satupeta_polygon(:table_name, NULL);"),
                {"table_name": table_name},
            )

        print(f"[SUCCESS] Cleansing selesai untuk tabel: {table_name}")
        return "done"
    except SQLAlchemyError as e:
        print(f"[ERROR] Cleansing database gagal: {e}")
        # Re-raise so the calling service knows the step failed; chaining
        # keeps the original database error available as __cause__.
        raise RuntimeError(f"Database cleansing failed: {e}") from e