file_table_reader/services/upload_file/upload.py
2025-11-24 08:57:43 +07:00

432 lines
13 KiB
Python

import json
import os
import pandas as pd
import geopandas as gpd
import numpy as np
import re
import zipfile
from shapely.geometry.base import BaseGeometry
from shapely.geometry import base as shapely_base
from fastapi import File, Form, UploadFile, HTTPException
from core.config import UPLOAD_FOLDER, MAX_FILE_MB, VALID_WKT_PREFIXES
from services.upload_file.readers.reader_csv import read_csv
from services.upload_file.readers.reader_shp import read_shp
from services.upload_file.readers.reader_gdb import read_gdb
from services.upload_file.readers.reader_mpk import read_mpk
from services.upload_file.readers.reader_pdf import convert_df, read_pdf
from services.upload_file.utils.geometry_detector import detect_and_build_geometry
from services.upload_file.utils.geometry_detector import attach_polygon_geometry_auto
from database.connection import engine, sync_engine
from database.models import Base
from pydantic import BaseModel
from typing import Any, Dict, List, Optional
from shapely import wkt
from sqlalchemy import text
from datetime import datetime
from response import successRes, errorRes
from utils.logger_config import log_activity
# Base.metadata.create_all(bind=engine)
def is_geom_empty(g):
if g is None:
return True
if isinstance(g, float) and pd.isna(g):
return True
if isinstance(g, BaseGeometry):
return g.is_empty
return False
def safe_json(value):
"""Konversi aman untuk semua tipe numpy/pandas/shapely ke tipe JSON-serializable"""
if isinstance(value, (np.int64, np.int32)):
return int(value)
if isinstance(value, (np.float64, np.float32)):
return float(value)
if isinstance(value, pd.Timestamp):
return value.isoformat()
if isinstance(value, shapely_base.BaseGeometry):
return str(value) # convert to WKT string
if pd.isna(value):
return None
return value
def detect_zip_type(zip_path: str) -> str:
with zipfile.ZipFile(zip_path, "r") as zip_ref:
files = zip_ref.namelist()
if any(f.lower().endswith(".gdb/") or ".gdb/" in f.lower() for f in files):
return "gdb"
if any(f.lower().endswith(ext) for ext in [".gdbtable", ".gdbtablx", ".gdbindexes", ".spx"] for f in files):
return "gdb"
if any(f.lower().endswith(".shp") for f in files):
return "shp"
return "unknown"
def process_data(df: pd.DataFrame, ext: str):
result = detect_and_build_geometry(df, master_polygons=None)
if not hasattr(result, "geometry") or result.geometry.isna().all():
result = attach_polygon_geometry_auto(result)
if isinstance(result, gpd.GeoDataFrame) and "geometry" in result.columns:
geom_type = ", ".join([g for g in result.geometry.geom_type.unique() if g]) \
if not result.empty else "None"
null_geom = result.geometry.isna().sum()
print(f"[INFO] Tipe Geometry: {geom_type}")
print(f"[INFO] Jumlah geometry kosong: {null_geom}")
else:
res = {
"message": "Tidak menemukan tabel yang relevan.",
"file_type": ext,
"rows": 0,
"columns": 0,
"geometry_valid": 0,
"geometry_empty": 0,
"geometry_valid_percent": 0,
"warnings": [],
"warning_examples": [],
"preview": []
}
return errorRes(message="Tidak berhasil mencocokan geometry pada tabel." ,details=res, status_code=422)
result = result.replace([pd.NA, float('inf'), float('-inf')], None)
if isinstance(result, gpd.GeoDataFrame) and 'geometry' in result.columns:
result['geometry'] = result['geometry'].apply(
lambda g: g.wkt if g is not None else None
)
empty_count = result['geometry'].apply(is_geom_empty).sum()
valid_count = len(result) - empty_count
match_percentage = (valid_count / len(result)) * 100
warnings = []
if empty_count > 0:
warnings.append(
f"{empty_count} dari {len(result)} baris tidak memiliki geometry yang valid "
f"({100 - match_percentage:.2f}% data gagal cocok)."
)
if empty_count > 0:
examples = result[result['geometry'].apply(is_geom_empty)].head(500)
warning_examples = examples.to_dict(orient="records")
else:
warning_examples = []
preview_data = result.head(15).to_dict(orient="records")
# preview_data = result.to_dict(orient="records")
preview_safe = [
{k: safe_json(v) for k, v in row.items()} for row in preview_data
]
warning_safe = [
{k: safe_json(v) for k, v in row.items()} for row in warning_examples
]
response = {
"message": "File berhasil dibaca dan dianalisis.",
"file_type": ext,
"rows": int(len(result)),
"columns": list(map(str, result.columns)),
"geometry_valid": int(valid_count),
"geometry_empty": int(empty_count),
"geometry_valid_percent": float(round(match_percentage, 2)),
"warnings": warnings,
"warning_examples": warning_safe,
"preview": preview_safe
}
# return successRes(content=response)
return response
async def handle_upload_file(file: UploadFile = File(...), page: Optional[str] = Form(""), sheet: Optional[str] = Form("")):
fname = file.filename
ext = os.path.splitext(fname)[1].lower()
contents = await file.read()
size_mb = len(contents) / (1024*1024)
if size_mb > MAX_FILE_MB:
raise errorRes(status_code=413, message="Ukuran File Terlalu Besar")
tmp_path = UPLOAD_FOLDER / fname
with open(tmp_path, "wb") as f:
f.write(contents)
try:
df = None
print('ext', ext)
if ext == ".csv":
df = read_csv(str(tmp_path))
elif ext == ".xlsx":
df = read_csv(str(tmp_path), sheet)
elif ext == ".mpk":
df = read_mpk(str(tmp_path))
elif ext == ".pdf":
tbl = read_pdf(tmp_path, page)
if len(tbl) == 0:
res = {
"message": "Tidak ditemukan tabel valid",
"tables": {},
"file_type": ext
}
return successRes(message="Tidak ditemukan tabel valid", data=res)
elif len(tbl) > 1:
res = {
"message": "File berhasil dibaca dan dianalisis.",
"tables": tbl,
"file_type": ext
}
return successRes(data=res, message="File berhasil dibaca dan dianalisis.")
else:
df = convert_df(tbl[0])
elif ext == ".zip":
zip_type = detect_zip_type(str(tmp_path))
if zip_type == "shp":
print("[INFO] ZIP terdeteksi sebagai Shapefile.")
df = read_shp(str(tmp_path))
elif zip_type == "gdb":
print("[INFO] ZIP terdeteksi sebagai Geodatabase (GDB).")
df = read_gdb(str(tmp_path))
else:
raise errorRes(
status_code=400,
message="ZIP file tidak mengandung SHP atau GDB yang valid."
)
else:
raise errorRes(status_code=400, message="Unsupported file type")
if df is None or (hasattr(df, "empty") and df.empty):
return successRes(message="File berhasil dibaca, Tetapi tidak ditemukan tabel valid")
res = process_data(df, ext)
tmp_path.unlink(missing_ok=True)
return successRes(data=res)
except Exception as e:
print(f"[ERROR] {e}")
return errorRes(
message="Internal Server Error",
details=str(e),
status_code=500
)
# finally:
# db_session.close()
class PdfRequest(BaseModel):
title: str
columns: List[str]
rows: List[List]
async def handle_process_pdf(payload: PdfRequest):
try:
df = convert_df(payload.model_dump())
if df is None or (hasattr(df, "empty") and df.empty):
return errorRes(message="Tidak ada tabel")
res = process_data(df, '.pdf')
return successRes(data=res)
except Exception as e:
print(f"[ERROR] {e}")
return errorRes(message="Internal Server Error", details= str(e), status_code=500)
# finally:
# db_session.close()
class UploadRequest(BaseModel):
title: str
rows: List[dict]
columns: List[str]
author: Dict[str, Any]
async def generate_unique_table_name(base_name: str):
base_name = base_name.lower().replace(" ", "_").replace("-", "_")
table_name = base_name
counter = 2
async with engine.connect() as conn:
while True:
result = await conn.execute(
text("SELECT to_regclass(:tname)"),
{"tname": table_name}
)
exists = result.scalar()
if not exists:
return table_name
table_name = f"{base_name}_{counter}"
counter += 1
def str_to_date(raw_date: str):
if raw_date:
try:
return datetime.strptime(raw_date, "%Y-%m-%d").date()
except Exception as e:
print("[WARNING] Tidak bisa parse dateCreated:", e)
return None
import asyncio
async def handle_to_postgis(payload: UploadRequest, user_id: int = 2):
try:
table_name = await generate_unique_table_name(payload.title)
df = pd.DataFrame(payload.rows)
df.columns = [col.upper() for col in df.columns]
if "GEOMETRY" not in df.columns:
raise HTTPException(400, "Kolom GEOMETRY tidak ditemukan")
df["GEOMETRY"] = df["GEOMETRY"].apply(
lambda g: wkt.loads(g)
if isinstance(g, str) else None
)
gdf = gpd.GeoDataFrame(df, geometry="GEOMETRY", crs="EPSG:4326")
# --- Wajib: gunakan engine sync TANPA asyncpg
loop = asyncio.get_running_loop()
await loop.run_in_executor(
None,
lambda: gdf.to_postgis(
table_name,
sync_engine, # JANGAN ENGINE ASYNC
if_exists="replace",
index=False
)
)
# === STEP 4: add ID column ===
async with engine.begin() as conn:
await conn.execute(text(
f'ALTER TABLE "{table_name}" ADD COLUMN _ID SERIAL PRIMARY KEY;'
))
# === STEP 5: save author metadata ===
author = payload.author
async with engine.begin() as conn:
await conn.execute(text("""
INSERT INTO backend.author_metadata (
table_title,
dataset_title,
dataset_abstract,
keywords,
topic_category,
date_created,
dataset_status,
organization_name,
contact_person_name,
contact_email,
contact_phone,
geom_type,
user_id
) VALUES (
:table_title,
:dataset_title,
:dataset_abstract,
:keywords,
:topic_category,
:date_created,
:dataset_status,
:organization_name,
:contact_person_name,
:contact_email,
:contact_phone,
:geom_type,
:user_id
)
"""), {
"table_title": table_name,
"dataset_title": author.get("title") or payload.title,
"dataset_abstract": author.get("abstract"),
"keywords": author.get("keywords"),
"topic_category": author.get("topicCategory"),
"date_created": str_to_date(author.get("dateCreated")),
"dataset_status": author.get("status"),
"organization_name": author.get("organization"),
"contact_person_name": author.get("contactName"),
"contact_email": author.get("contactEmail"),
"contact_phone": author.get("contactPhone"),
"geom_type": json.dumps(list(gdf.geom_type.unique())),
"user_id": user_id
})
# === STEP 6: log success ===
await log_activity(
user_id=user_id,
action_type="UPLOAD",
action_title=f"Upload dataset {table_name}",
details={"table_name": table_name, "rows": len(gdf)}
)
res = {
"table_name": table_name,
"status": "success",
"message": f"Tabel '{table_name}' berhasil dibuat.",
"total_rows": len(gdf),
"geometry_type": list(gdf.geom_type.unique()),
}
return successRes(data=res)
except Exception as e:
await log_activity(
user_id=user_id,
action_type="ERROR",
action_title="Upload gagal",
details={"error": str(e)}
)
print(f"error : {str(e)}")
raise HTTPException(status_code=500, detail=str(e))