# file_table_reader/services/upload_file/upload.py
import asyncio
import json
import os
import re
import zipfile
from datetime import datetime
from typing import Any, Dict, List, Optional

import geopandas as gpd
import numpy as np
import pandas as pd
from fastapi import File, Form, HTTPException, UploadFile
from pydantic import BaseModel
from shapely import wkt
from shapely.geometry import base as shapely_base
from shapely.geometry.base import BaseGeometry
from sqlalchemy import text

from core.config import MAX_FILE_MB, UPLOAD_FOLDER, VALID_WKT_PREFIXES
from database.connection import engine, sync_engine
from database.models import Base
from response import errorRes, successRes
from services.upload_file.readers.reader_csv import read_csv
from services.upload_file.readers.reader_gdb import read_gdb
from services.upload_file.readers.reader_mpk import read_mpk
from services.upload_file.readers.reader_pdf import convert_df, read_pdf
from services.upload_file.readers.reader_shp import read_shp
from services.upload_file.utils.geometry_detector import (
    attach_polygon_geometry_auto,
    detect_and_build_geometry,
)
from utils.logger_config import log_activity

# Base.metadata.create_all(bind=engine)
def is_geom_empty(g):
    """Return True when *g* carries no usable geometry.

    None, NaN floats and empty shapely geometries count as empty;
    every other value (including WKT strings) counts as non-empty.
    """
    if g is None:
        return True
    if isinstance(g, float):
        return bool(pd.isna(g))
    if isinstance(g, BaseGeometry):
        return g.is_empty
    return False
def safe_json(value):
    """Coerce numpy/pandas/shapely values into JSON-serializable Python types.

    Improvements over the previous version:
    - matches ANY numpy integer/float width via ``np.integer``/``np.floating``
      (before, only int64/int32/float64/float32 were converted, so e.g.
      ``np.int16`` leaked through and broke JSON encoding), and ``np.bool_``;
    - guards ``pd.isna`` with ``np.ndim`` so array-like values (lists,
      ndarrays) no longer make the truth test ambiguous/raise.
    """
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, np.floating):
        return float(value)
    if isinstance(value, np.bool_):
        return bool(value)
    if isinstance(value, pd.Timestamp):
        return value.isoformat()
    if isinstance(value, shapely_base.BaseGeometry):
        return str(value)  # WKT representation
    # pd.isna is only well-defined for scalars; skip array-likes.
    if np.ndim(value) == 0 and pd.isna(value):
        return None
    return value
def detect_zip_type(zip_path: str) -> str:
    """Classify a ZIP archive by its member names.

    Returns "gdb" for a File Geodatabase, "shp" for a Shapefile,
    otherwise "unknown".
    """
    gdb_suffixes = (".gdbtable", ".gdbtablx", ".gdbindexes", ".spx")
    with zipfile.ZipFile(zip_path, "r") as archive:
        names = [member.lower() for member in archive.namelist()]
    # A ".gdb/" directory component anywhere in a path marks a geodatabase.
    if any(".gdb/" in name for name in names):
        return "gdb"
    # Loose geodatabase component files also count.
    if any(name.endswith(gdb_suffixes) for name in names):
        return "gdb"
    if any(name.endswith(".shp") for name in names):
        return "shp"
    return "unknown"
def process_data(df: pd.DataFrame, ext: str):
    """Detect/build geometry for *df* and return an analysis summary dict.

    Tries column-based geometry detection first, then falls back to the
    automatic polygon matcher. On failure returns ``errorRes`` (422).
    The summary contains row/column counts, geometry validity stats, up to
    500 example rows with invalid geometry, and a 15-row JSON-safe preview.
    """
    result = detect_and_build_geometry(df, master_polygons=None)

    # Fall back to polygon matching when no geometry could be detected.
    if not hasattr(result, "geometry") or result.geometry.isna().all():
        result = attach_polygon_geometry_auto(result)

    if not (isinstance(result, gpd.GeoDataFrame) and "geometry" in result.columns):
        res = {
            "message": "Tidak menemukan tabel yang relevan.",
            "file_type": ext,
            "rows": 0,
            "columns": 0,
            "geometry_valid": 0,
            "geometry_empty": 0,
            "geometry_valid_percent": 0,
            "warnings": [],
            "warning_examples": [],
            "preview": []
        }
        return errorRes(message="Tidak berhasil mencocokan geometry pada tabel.", details=res, status_code=422)

    geom_type = ", ".join([g for g in result.geometry.geom_type.unique() if g]) \
        if not result.empty else "None"
    null_geom = result.geometry.isna().sum()
    print(f"[INFO] Tipe Geometry: {geom_type}")
    print(f"[INFO] Jumlah geometry kosong: {null_geom}")

    result = result.replace([pd.NA, float('inf'), float('-inf')], None)

    # BUG FIX: compute the empty-geometry mask on the raw geometry objects
    # BEFORE serialising them to WKT. Previously the check ran after the
    # conversion, so empty-but-non-null geometries became strings like
    # "POLYGON EMPTY" and were silently counted as valid.
    empty_mask = result['geometry'].apply(is_geom_empty)
    result['geometry'] = result['geometry'].apply(
        lambda g: g.wkt if g is not None else None
    )

    empty_count = int(empty_mask.sum())
    total_rows = len(result)
    valid_count = total_rows - empty_count
    # BUG FIX: guard against an empty frame (previously ZeroDivisionError).
    match_percentage = (valid_count / total_rows) * 100 if total_rows else 0.0

    warnings = []
    warning_examples = []
    if empty_count > 0:
        warnings.append(
            f"{empty_count} dari {total_rows} baris tidak memiliki geometry yang valid "
            f"({100 - match_percentage:.2f}% data gagal cocok)."
        )
        # Echo back up to 500 offending rows as examples.
        warning_examples = result[empty_mask].head(500).to_dict(orient="records")

    preview_data = result.head(15).to_dict(orient="records")
    preview_safe = [
        {k: safe_json(v) for k, v in row.items()} for row in preview_data
    ]
    warning_safe = [
        {k: safe_json(v) for k, v in row.items()} for row in warning_examples
    ]
    return {
        "message": "File berhasil dibaca dan dianalisis.",
        "file_type": ext,
        "rows": int(total_rows),
        "columns": list(map(str, result.columns)),
        "geometry_valid": int(valid_count),
        "geometry_empty": int(empty_count),
        "geometry_valid_percent": float(round(match_percentage, 2)),
        "warnings": warnings,
        "warning_examples": warning_safe,
        "preview": preview_safe
    }
async def handle_upload_file(file: UploadFile = File(...), page: Optional[str] = Form(""), sheet: Optional[str] = Form("")):
    """Analyse an uploaded tabular/spatial file and return a geometry summary.

    Supported extensions: .csv, .xlsx (``sheet`` selects the worksheet),
    .mpk, .pdf (``page`` selects pages; multi-table PDFs are returned raw so
    the client can pick one table), and .zip (auto-detected SHP or GDB).

    The upload is written under UPLOAD_FOLDER, parsed by the matching reader,
    and run through process_data(). The temp file is removed only on success.
    NOTE(review): the temp file is NOT removed on error/early-return paths —
    confirm whether that is intentional.
    """
    fname = file.filename
    ext = os.path.splitext(fname)[1].lower()
    contents = await file.read()
    size_mb = len(contents) / (1024*1024)
    if size_mb > MAX_FILE_MB:
        # NOTE(review): `raise errorRes(...)` only works if errorRes returns an
        # exception instance; if it builds a Response object this raise would
        # TypeError — confirm errorRes's type.
        raise errorRes(status_code=413, message="Ukuran File Terlalu Besar")
    # Persist the upload so the path-based readers can open it.
    tmp_path = UPLOAD_FOLDER / fname
    with open(tmp_path, "wb") as f:
        f.write(contents)
    try:
        df = None
        print('ext', ext)
        if ext == ".csv":
            df = read_csv(str(tmp_path))
        elif ext == ".xlsx":
            # read_csv apparently handles Excel as well; `sheet` picks the sheet.
            df = read_csv(str(tmp_path), sheet)
        elif ext == ".mpk":
            df = read_mpk(str(tmp_path))
        elif ext == ".pdf":
            tbl = read_pdf(tmp_path, page)
            if len(tbl) == 0:
                # No parsable table found in the PDF.
                res = {
                    "message": "Tidak ditemukan tabel valid",
                    "tables": {},
                    "file_type": ext
                }
                return successRes(message="Tidak ditemukan tabel valid", data=res)
            elif len(tbl) > 1:
                # Multiple tables: return them all and let the client choose
                # one (the follow-up call is handle_process_pdf).
                res = {
                    "message": "File berhasil dibaca dan dianalisis.",
                    "tables": tbl,
                    "file_type": ext
                }
                return successRes(data=res, message="File berhasil dibaca dan dianalisis.")
            else:
                df = convert_df(tbl[0])
        elif ext == ".zip":
            # A ZIP may wrap either a Shapefile or a File Geodatabase.
            zip_type = detect_zip_type(str(tmp_path))
            if zip_type == "shp":
                print("[INFO] ZIP terdeteksi sebagai Shapefile.")
                df = read_shp(str(tmp_path))
            elif zip_type == "gdb":
                print("[INFO] ZIP terdeteksi sebagai Geodatabase (GDB).")
                df = read_gdb(str(tmp_path))
            else:
                # NOTE(review): raised inside the try block, so this is caught
                # by the handler below and surfaced as a 500, not a 400 —
                # likely unintended.
                raise errorRes(
                    status_code=400,
                    message="ZIP file tidak mengandung SHP atau GDB yang valid."
                )
        else:
            # NOTE(review): same caught-by-except concern as above.
            raise errorRes(status_code=400, message="Unsupported file type")
        if df is None or (hasattr(df, "empty") and df.empty):
            return successRes(message="File berhasil dibaca, Tetapi tidak ditemukan tabel valid")
        res = process_data(df, ext)
        # Clean up the temp file only after a successful parse.
        tmp_path.unlink(missing_ok=True)
        return successRes(data=res)
    except Exception as e:
        print(f"[ERROR] {e}")
        return errorRes(
            message="Internal Server Error",
            details=str(e),
            status_code=500
        )
    # finally:
    #     db_session.close()
class PdfRequest(BaseModel):
    """Payload for re-processing one PDF table selected on the client."""
    title: str          # table title as shown to the user
    columns: List[str]  # column headers
    rows: List[List]    # table cells, one inner list per row
async def handle_process_pdf(payload: PdfRequest):
    """Convert a client-selected PDF table into a DataFrame and analyse it.

    Returns the process_data() summary on success, an errorRes when the
    payload yields no table, and a 500 errorRes on unexpected failures.
    """
    try:
        frame = convert_df(payload.model_dump())
        # Reject both a missing result and an empty DataFrame.
        frame_is_empty = hasattr(frame, "empty") and frame.empty
        if frame is None or frame_is_empty:
            return errorRes(message="Tidak ada tabel")
        analysis = process_data(frame, '.pdf')
        return successRes(data=analysis)
    except Exception as exc:
        print(f"[ERROR] {exc}")
        return errorRes(message="Internal Server Error", details=str(exc), status_code=500)
class UploadRequest(BaseModel):
    """Payload for persisting an analysed table into PostGIS."""
    title: str             # dataset title; also the basis for the table name
    rows: List[dict]       # records; presumably each carries a WKT geometry
                           # column (see handle_to_postgis) — TODO confirm
    columns: List[str]     # column names as shown to the user
    author: Dict[str, Any]  # author/dataset metadata (title, abstract, ...)
async def generate_unique_table_name(base_name: str):
    """Derive a Postgres-safe table name from *base_name*, unique in the DB.

    Lower-cases the title and replaces every character outside [a-z0-9_]
    with "_" (the previous version only handled spaces and hyphens, letting
    dots, slashes, quotes etc. into the identifier), then probes
    ``to_regclass()`` and appends ``_2``, ``_3``, ... until the name is free.
    """
    base_name = base_name.lower().replace(" ", "_").replace("-", "_")
    # Sanitize any remaining characters so the name is a safe SQL identifier.
    base_name = re.sub(r"[^a-z0-9_]", "_", base_name)
    if not base_name:
        # Empty titles would otherwise produce an empty identifier.
        base_name = "table"
    table_name = base_name
    counter = 2

    async with engine.connect() as conn:
        while True:
            result = await conn.execute(
                text("SELECT to_regclass(:tname)"),
                {"tname": table_name}
            )
            exists = result.scalar()

            # to_regclass returns NULL when no relation has this name.
            if not exists:
                return table_name

            table_name = f"{base_name}_{counter}"
            counter += 1
def str_to_date(raw_date: str):
    """Parse a "YYYY-MM-DD" string into a date.

    Returns None for falsy input or when parsing fails (a warning is
    printed in that case).
    """
    if not raw_date:
        return None
    try:
        parsed = datetime.strptime(raw_date, "%Y-%m-%d")
    except Exception as exc:
        print("[WARNING] Tidak bisa parse dateCreated:", exc)
        return None
    return parsed.date()
import asyncio  # NOTE(review): mid-file import — should live with the top-level imports


async def handle_to_postgis(payload: UploadRequest, user_id: int = 2):
    """Persist an analysed table into PostGIS and record its author metadata.

    Steps: generate a unique table name, parse WKT geometries, write the
    GeoDataFrame via the synchronous engine (in a thread), add a serial
    primary key, insert author metadata into backend.author_metadata, and
    log the activity. Raises HTTPException(500) on any failure.

    NOTE(review): user_id defaults to 2 — looks like a placeholder until
    authentication is wired in; confirm.
    """
    try:
        table_name = await generate_unique_table_name(payload.title)

        df = pd.DataFrame(payload.rows)
        # Normalise column names to upper case so GEOMETRY is found
        # regardless of the client's casing.
        df.columns = [col.upper() for col in df.columns]

        if "GEOMETRY" not in df.columns:
            # NOTE(review): raised inside the try block, so this 400 is caught
            # below and re-raised as a 500 — confirm that is intended.
            raise HTTPException(400, "Kolom GEOMETRY tidak ditemukan")

        # Strings are parsed as WKT; any non-string cell (None/NaN) becomes None.
        df["GEOMETRY"] = df["GEOMETRY"].apply(
            lambda g: wkt.loads(g)
            if isinstance(g, str) else None
        )

        # CRS is assumed to be WGS84 — TODO confirm inputs are EPSG:4326.
        gdf = gpd.GeoDataFrame(df, geometry="GEOMETRY", crs="EPSG:4326")
        # Required: to_postgis() needs the sync engine (NOT asyncpg); run it
        # in a worker thread so the event loop is not blocked.
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(
            None,
            lambda: gdf.to_postgis(
                table_name,
                sync_engine,  # do NOT use the async engine here
                if_exists="replace",
                index=False
            )
        )

        # === STEP 4: add ID column ===
        # NOTE(review): _ID is unquoted, so Postgres folds it to "_id".
        async with engine.begin() as conn:
            await conn.execute(text(
                f'ALTER TABLE "{table_name}" ADD COLUMN _ID SERIAL PRIMARY KEY;'
            ))

        # === STEP 5: save author metadata ===
        author = payload.author

        async with engine.begin() as conn:
            await conn.execute(text("""
                INSERT INTO backend.author_metadata (
                    table_title,
                    dataset_title,
                    dataset_abstract,
                    keywords,
                    topic_category,
                    date_created,
                    dataset_status,
                    organization_name,
                    contact_person_name,
                    contact_email,
                    contact_phone,
                    geom_type,
                    user_id
                ) VALUES (
                    :table_title,
                    :dataset_title,
                    :dataset_abstract,
                    :keywords,
                    :topic_category,
                    :date_created,
                    :dataset_status,
                    :organization_name,
                    :contact_person_name,
                    :contact_email,
                    :contact_phone,
                    :geom_type,
                    :user_id
                )
            """), {
                "table_title": table_name,
                # Falls back to the payload title when no metadata title given.
                "dataset_title": author.get("title") or payload.title,
                "dataset_abstract": author.get("abstract"),
                "keywords": author.get("keywords"),
                "topic_category": author.get("topicCategory"),
                "date_created": str_to_date(author.get("dateCreated")),
                "dataset_status": author.get("status"),
                "organization_name": author.get("organization"),
                "contact_person_name": author.get("contactName"),
                "contact_email": author.get("contactEmail"),
                "contact_phone": author.get("contactPhone"),
                # Distinct geometry types stored as a JSON array string.
                "geom_type": json.dumps(list(gdf.geom_type.unique())),
                "user_id": user_id
            })
        # === STEP 6: log success ===
        await log_activity(
            user_id=user_id,
            action_type="UPLOAD",
            action_title=f"Upload dataset {table_name}",
            details={"table_name": table_name, "rows": len(gdf)}
        )

        res = {
            "table_name": table_name,
            "status": "success",
            "message": f"Tabel '{table_name}' berhasil dibuat.",
            "total_rows": len(gdf),
            "geometry_type": list(gdf.geom_type.unique()),
        }
        return successRes(data=res)

    except Exception as e:
        # Best-effort failure log; the original error is re-raised as a 500.
        await log_activity(
            user_id=user_id,
            action_type="ERROR",
            action_title="Upload gagal",
            details={"error": str(e)}
        )
        print(f"error : {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))