testing qgis server
This commit is contained in:
parent
27ee77331c
commit
f4113e7f0d
BIN
api/deps/__pycache__/auth_dependency.cpython-39.pyc
Normal file
BIN
api/deps/__pycache__/auth_dependency.cpython-39.pyc
Normal file
Binary file not shown.
BIN
api/deps/__pycache__/role_dependency.cpython-39.pyc
Normal file
BIN
api/deps/__pycache__/role_dependency.cpython-39.pyc
Normal file
Binary file not shown.
34
api/deps/auth_dependency.py
Normal file
34
api/deps/auth_dependency.py
Normal file
|
|
@ -0,0 +1,34 @@
|
||||||
|
from fastapi import Depends, Header
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
from sqlalchemy.future import select
|
||||||
|
from datetime import datetime
|
||||||
|
from response import errorRes
|
||||||
|
|
||||||
|
from database.connection import SessionLocal
|
||||||
|
from database.models import User
|
||||||
|
|
||||||
|
async def get_db():
|
||||||
|
async with SessionLocal() as session:
|
||||||
|
yield session
|
||||||
|
|
||||||
|
|
||||||
|
async def get_current_user(
|
||||||
|
authorization: str = Header(None),
|
||||||
|
db: AsyncSession = Depends(get_db)
|
||||||
|
):
|
||||||
|
if not authorization or not authorization.startswith("Bearer "):
|
||||||
|
raise errorRes(status_code=401, message="Missing or invalid token")
|
||||||
|
|
||||||
|
token = authorization.split(" ")[1]
|
||||||
|
result = await db.execute(select(User).where(User.active_token == token))
|
||||||
|
user = result.scalar_one_or_none()
|
||||||
|
|
||||||
|
# Case 1: Token not found → maybe replaced by new login
|
||||||
|
if not user:
|
||||||
|
raise errorRes(status_code=401, message="Token invalid or used by another login")
|
||||||
|
|
||||||
|
# Case 2: Token expired
|
||||||
|
if user.token_expired_at and user.token_expired_at < datetime.utcnow():
|
||||||
|
raise errorRes(status_code=401, message="Token expired")
|
||||||
|
|
||||||
|
return user
|
||||||
20
api/deps/role_dependency.py
Normal file
20
api/deps/role_dependency.py
Normal file
|
|
@ -0,0 +1,20 @@
|
||||||
|
from fastapi import Depends, status
|
||||||
|
from api.deps.auth_dependency import get_current_user
|
||||||
|
from response import errorRes
|
||||||
|
|
||||||
|
def require_role(required_role: str):
|
||||||
|
"""
|
||||||
|
Return a dependency function that ensures the current user has a specific role.
|
||||||
|
Example usage:
|
||||||
|
@router.get("/admin", dependencies=[Depends(require_role("admin"))])
|
||||||
|
"""
|
||||||
|
async def role_checker(user = Depends(get_current_user)):
|
||||||
|
if user.role != required_role:
|
||||||
|
raise errorRes(
|
||||||
|
status_code=status.HTTP_403_FORBIDDEN,
|
||||||
|
message="Access denied",
|
||||||
|
detail=f"Access denied: requires role '{required_role}'",
|
||||||
|
)
|
||||||
|
return user
|
||||||
|
|
||||||
|
return role_checker
|
||||||
BIN
api/routers/__pycache__/auth_router.cpython-39.pyc
Normal file
BIN
api/routers/__pycache__/auth_router.cpython-39.pyc
Normal file
Binary file not shown.
BIN
api/routers/__pycache__/system_router.cpython-39.pyc
Normal file
BIN
api/routers/__pycache__/system_router.cpython-39.pyc
Normal file
Binary file not shown.
BIN
api/routers/__pycache__/upload_file_router.cpython-39.pyc
Normal file
BIN
api/routers/__pycache__/upload_file_router.cpython-39.pyc
Normal file
Binary file not shown.
14
api/routers/auth_router.py
Normal file
14
api/routers/auth_router.py
Normal file
|
|
@ -0,0 +1,14 @@
|
||||||
|
from fastapi import APIRouter, Depends
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
from services.auth.login import loginService, get_db
|
||||||
|
|
||||||
|
router = APIRouter()
|
||||||
|
|
||||||
|
class LoginRequest(BaseModel):
|
||||||
|
username: str
|
||||||
|
password: str
|
||||||
|
|
||||||
|
@router.post("/login")
|
||||||
|
async def login(request: LoginRequest, db: AsyncSession = Depends(get_db)):
|
||||||
|
return await loginService(request.username, request.password, db)
|
||||||
20
api/routers/datasets_router.py
Normal file
20
api/routers/datasets_router.py
Normal file
|
|
@ -0,0 +1,20 @@
|
||||||
|
from fastapi import APIRouter
|
||||||
|
from core.config import engine
|
||||||
|
from services.datasets.delete import delete_dataset_from_partition # import fungsi di atas
|
||||||
|
from response import successRes, errorRes
|
||||||
|
|
||||||
|
router = APIRouter()
|
||||||
|
|
||||||
|
@router.delete("/dataset/{user_id}/{metadata_id}")
|
||||||
|
async def delete_dataset(user_id: int, metadata_id: int, title: str):
|
||||||
|
"""
|
||||||
|
Hapus dataset tertentu (berdasarkan user_id dan metadata_id)
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
async with engine.begin() as conn:
|
||||||
|
await delete_dataset_from_partition(conn, user_id, metadata_id, title)
|
||||||
|
return successRes(message=f"Dataset {title} berhasil dihapus.", data="")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[ERROR] Gagal hapus dataset: {e}")
|
||||||
|
raise errorRes(status_code=500, details=str(e), message="Gagal hapus dataset")
|
||||||
|
|
@ -1,13 +1,15 @@
|
||||||
|
|
||||||
from fastapi import APIRouter, File, Form, UploadFile
|
from fastapi import APIRouter, File, Form, UploadFile, Depends
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
from services.upload_file.upload import handle_upload_file, handle_process_pdf, handle_to_postgis
|
from services.upload_file.upload import handle_upload_file, handle_process_pdf, handle_to_postgis
|
||||||
|
from api.deps.role_dependency import require_role
|
||||||
|
from database.connection import engine
|
||||||
|
|
||||||
router = APIRouter()
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
@router.post("/file")
|
@router.post("/file")
|
||||||
|
# async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form(""), sheet: Optional[str] = Form(""), user = Depends(require_role("admin"))):
|
||||||
async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form(""), sheet: Optional[str] = Form("")):
|
async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form(""), sheet: Optional[str] = Form("")):
|
||||||
return await handle_upload_file(file, page, sheet)
|
return await handle_upload_file(file, page, sheet)
|
||||||
|
|
||||||
|
|
@ -30,5 +32,5 @@ class UploadRequest(BaseModel):
|
||||||
columns: List[str]
|
columns: List[str]
|
||||||
|
|
||||||
@router.post("/to-postgis")
|
@router.post("/to-postgis")
|
||||||
def upload_to_postgis(payload: UploadRequest):
|
async def upload_to_postgis(payload: UploadRequest):
|
||||||
return handle_to_postgis(payload)
|
return await handle_to_postgis(payload, engine)
|
||||||
|
|
@ -1,6 +1,9 @@
|
||||||
from sqlalchemy import create_engine
|
from sqlalchemy import create_engine
|
||||||
|
from sqlalchemy.ext.asyncio import create_async_engine
|
||||||
from sqlalchemy.orm import sessionmaker
|
from sqlalchemy.orm import sessionmaker
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
|
||||||
from core.config import POSTGIS_URL
|
from core.config import POSTGIS_URL
|
||||||
|
|
||||||
engine = create_engine(POSTGIS_URL, pool_pre_ping=True)
|
engine = create_async_engine(POSTGIS_URL, pool_pre_ping=True)
|
||||||
SessionLocal = sessionmaker(bind=engine)
|
# SessionLocal = sessionmaker(bind=engine)
|
||||||
|
SessionLocal = async_sessionmaker(engine, expire_on_commit=False)
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
from sqlalchemy import Column, Integer, String, Text, TIMESTAMP
|
from sqlalchemy import Column, Integer, String, Text, ForeignKey, DateTime, TIMESTAMP
|
||||||
|
from sqlalchemy.orm import relationship
|
||||||
from sqlalchemy.ext.declarative import declarative_base
|
from sqlalchemy.ext.declarative import declarative_base
|
||||||
from sqlalchemy.sql import func
|
from sqlalchemy.sql import func
|
||||||
|
|
||||||
|
|
@ -14,3 +15,28 @@ class UploadLog(Base):
|
||||||
uploaded_at = Column(TIMESTAMP, server_default=func.now())
|
uploaded_at = Column(TIMESTAMP, server_default=func.now())
|
||||||
status = Column(String)
|
status = Column(String)
|
||||||
message = Column(Text)
|
message = Column(Text)
|
||||||
|
|
||||||
|
|
||||||
|
class Institution(Base):
|
||||||
|
__tablename__ = "institutions"
|
||||||
|
|
||||||
|
id = Column(Integer, primary_key=True, index=True)
|
||||||
|
name = Column(String(100), unique=True, nullable=False)
|
||||||
|
address = Column(String(200), nullable=True)
|
||||||
|
|
||||||
|
users = relationship("User", back_populates="institution")
|
||||||
|
|
||||||
|
|
||||||
|
class User(Base):
|
||||||
|
__tablename__ = "users"
|
||||||
|
|
||||||
|
id = Column(Integer, primary_key=True, index=True)
|
||||||
|
username = Column(String(50), unique=True, nullable=False)
|
||||||
|
password_hash = Column(String(255), nullable=False)
|
||||||
|
role = Column(String(50), nullable=False, default="user") # <── Added role
|
||||||
|
institution_id = Column(Integer, ForeignKey("institutions.id"), nullable=True)
|
||||||
|
active_token = Column(String(255), nullable=True)
|
||||||
|
token_expired_at = Column(DateTime, nullable=True)
|
||||||
|
last_login = Column(DateTime, nullable=True)
|
||||||
|
|
||||||
|
institution = relationship("Institution", back_populates="users")
|
||||||
|
|
@ -1,16 +0,0 @@
|
||||||
import geopandas as gpd
|
|
||||||
import pandas as pd
|
|
||||||
from database.connection import engine
|
|
||||||
from sqlalchemy import text
|
|
||||||
|
|
||||||
def save_dataframe_dynamic(df: pd.DataFrame, table_name: str):
|
|
||||||
"""Save pandas DataFrame to Postgres (non-geo)."""
|
|
||||||
df.to_sql(table_name, engine, if_exists="replace", index=False, method='multi', chunksize=1000)
|
|
||||||
|
|
||||||
def save_geodataframe(gdf: gpd.GeoDataFrame, table_name: str):
|
|
||||||
"""Save GeoDataFrame to PostGIS (requires geoalchemy/geopandas)."""
|
|
||||||
# ensure geometry column exists and CRS set
|
|
||||||
if gdf.crs is None:
|
|
||||||
gdf = gdf.set_crs("EPSG:4326", allow_override=True)
|
|
||||||
# geopandas >= 0.10 has to_postgis in some installs; fallback using SQLAlchemy + GeoAlchemy2:
|
|
||||||
gdf.to_postgis(table_name, engine, if_exists="replace")
|
|
||||||
|
|
@ -1,3 +0,0 @@
|
||||||
from database.connection import engine
|
|
||||||
from database.models import Base
|
|
||||||
Base.metadata.create_all(bind=engine)
|
|
||||||
40
main.py
40
main.py
|
|
@ -3,9 +3,11 @@ from fastapi.middleware.cors import CORSMiddleware
|
||||||
from core.config import API_VERSION, ALLOWED_ORIGINS
|
from core.config import API_VERSION, ALLOWED_ORIGINS
|
||||||
from database.connection import engine
|
from database.connection import engine
|
||||||
from database.models import Base
|
from database.models import Base
|
||||||
from routes.router import router as system_router
|
from api.routers.system_router import router as system_router
|
||||||
from routes.upload_file_router import router as upload_router
|
from api.routers.upload_file_router import router as upload_router
|
||||||
from routes.auth_router import router as auth_router
|
from api.routers.auth_router import router as auth_router
|
||||||
|
from contextlib import asynccontextmanager
|
||||||
|
from utils.qgis_init import init_qgis
|
||||||
|
|
||||||
app = FastAPI(
|
app = FastAPI(
|
||||||
title="ETL Geo Upload Service",
|
title="ETL Geo Upload Service",
|
||||||
|
|
@ -21,9 +23,37 @@ app.add_middleware(
|
||||||
allow_headers=["*"],
|
allow_headers=["*"],
|
||||||
)
|
)
|
||||||
|
|
||||||
Base.metadata.create_all(bind=engine)
|
# Base.metadata.create_all(bind=engine)
|
||||||
|
|
||||||
|
# qgis setup
|
||||||
|
@asynccontextmanager
|
||||||
|
async def lifespan(app: FastAPI):
|
||||||
|
global qgs
|
||||||
|
qgs = init_qgis()
|
||||||
|
print("QGIS initialized")
|
||||||
|
|
||||||
|
yield
|
||||||
|
|
||||||
|
# SHUTDOWN (optional)
|
||||||
|
print("Shutting down...")
|
||||||
|
|
||||||
|
app = FastAPI(lifespan=lifespan)
|
||||||
|
|
||||||
|
@app.get("/qgis/status")
|
||||||
|
def qgis_status():
|
||||||
|
try:
|
||||||
|
from qgis.core import Qgis
|
||||||
|
return {
|
||||||
|
"qgis_status": "connected",
|
||||||
|
"qgis_version": Qgis.QGIS_VERSION
|
||||||
|
}
|
||||||
|
except Exception as e:
|
||||||
|
return {
|
||||||
|
"qgis_status": "error",
|
||||||
|
"error": str(e)
|
||||||
|
}
|
||||||
|
|
||||||
# Register routers
|
# Register routers
|
||||||
app.include_router(system_router, tags=["System"])
|
app.include_router(system_router, tags=["System"])
|
||||||
app.include_router(auth_router, prefix="/auth", tags=["Auth"])
|
app.include_router(auth_router, prefix="/auth", tags=["Auth"])
|
||||||
app.include_router(upload_router, prefix="/upload", tags=["Upload"])
|
app.include_router(upload_router, prefix="/upload", tags=["Upload"])
|
||||||
22
response.py
Normal file
22
response.py
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
from fastapi import HTTPException
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
|
|
||||||
|
def successRes(data=None, message="Success", status_code=200):
|
||||||
|
return JSONResponse(
|
||||||
|
status_code=status_code,
|
||||||
|
content={
|
||||||
|
"status": "success",
|
||||||
|
"message": message,
|
||||||
|
"data": data,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
def errorRes(message="Error", status_code=400, details=None):
|
||||||
|
return HTTPException(
|
||||||
|
status_code=status_code,
|
||||||
|
detail={
|
||||||
|
"status": "error",
|
||||||
|
"message": message,
|
||||||
|
"details": details
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
@ -1,15 +0,0 @@
|
||||||
from fastapi import APIRouter
|
|
||||||
from core.config import API_VERSION
|
|
||||||
|
|
||||||
router = APIRouter()
|
|
||||||
|
|
||||||
@router.get("/login")
|
|
||||||
async def login():
|
|
||||||
return {"status": "success"}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
BIN
services/.DS_Store
vendored
Normal file
BIN
services/.DS_Store
vendored
Normal file
Binary file not shown.
49
services/auth/login.py
Normal file
49
services/auth/login.py
Normal file
|
|
@ -0,0 +1,49 @@
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
from sqlalchemy.future import select
|
||||||
|
from passlib.context import CryptContext
|
||||||
|
from uuid import uuid4
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from database.connection import SessionLocal
|
||||||
|
from database.models import User
|
||||||
|
from response import successRes, errorRes
|
||||||
|
|
||||||
|
async def get_db():
|
||||||
|
async with SessionLocal() as session:
|
||||||
|
yield session
|
||||||
|
|
||||||
|
pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
|
||||||
|
|
||||||
|
async def loginService(username: str, password: str, db: AsyncSession):
|
||||||
|
result = await db.execute(select(User).where(User.username == username))
|
||||||
|
user = result.scalar_one_or_none()
|
||||||
|
|
||||||
|
if not user:
|
||||||
|
raise errorRes(status_code=401, message="Invalid username or password")
|
||||||
|
|
||||||
|
# Verify password
|
||||||
|
if not pwd_context.verify(password, user.password_hash):
|
||||||
|
raise errorRes(status_code=401, message="Invalid username or password")
|
||||||
|
|
||||||
|
# Validation for institution user
|
||||||
|
if user.role != "admin" and not user.institution_id:
|
||||||
|
raise errorRes(status_code=403, message="User must belong to an institution")
|
||||||
|
|
||||||
|
# Generate single active token
|
||||||
|
token = str(uuid4())
|
||||||
|
expiry = datetime.utcnow() + timedelta(hours=4)
|
||||||
|
|
||||||
|
user.active_token = token
|
||||||
|
user.token_expired_at = expiry
|
||||||
|
user.last_login = datetime.utcnow()
|
||||||
|
await db.commit()
|
||||||
|
|
||||||
|
res = {
|
||||||
|
"status": "success",
|
||||||
|
"username": user.username,
|
||||||
|
"role": user.role,
|
||||||
|
"institution_id": user.institution_id,
|
||||||
|
"token": token,
|
||||||
|
"token_expired_at": expiry.isoformat()
|
||||||
|
}
|
||||||
|
|
||||||
|
return successRes(message="Success Login", data=res)
|
||||||
33
services/datasets/delete.py
Normal file
33
services/datasets/delete.py
Normal file
|
|
@ -0,0 +1,33 @@
|
||||||
|
import re
|
||||||
|
from sqlalchemy import text
|
||||||
|
|
||||||
|
def slugify(value: str):
|
||||||
|
return re.sub(r'[^a-zA-Z0-9]+', '_', value.lower()).strip('_')
|
||||||
|
|
||||||
|
|
||||||
|
async def delete_dataset_from_partition(conn, user_id: int, metadata_id: int, title: str):
|
||||||
|
"""
|
||||||
|
Menghapus dataset tertentu pada partisi user tertentu.
|
||||||
|
- Hapus semua record di partisi (test_partition_user_{id})
|
||||||
|
- Hapus metadata dari dataset_metadata
|
||||||
|
- Hapus VIEW QGIS terkait
|
||||||
|
"""
|
||||||
|
base_table = f"test_partition_user_{user_id}"
|
||||||
|
norm_title = slugify(title)
|
||||||
|
view_name = f"v_user_{user_id}_{norm_title}"
|
||||||
|
|
||||||
|
print(f"[INFO] Menghapus dataset metadata_id={metadata_id} milik user_id={user_id}...")
|
||||||
|
|
||||||
|
# 1️⃣ Hapus data spasial dari partisi
|
||||||
|
await conn.execute(text(f"DELETE FROM {base_table} WHERE metadata_id = :mid;"), {"mid": metadata_id})
|
||||||
|
print(f"[INFO] Data spasial dari partisi {base_table} (metadata_id={metadata_id}) dihapus.")
|
||||||
|
|
||||||
|
# 2️⃣ Hapus metadata dari dataset_metadata
|
||||||
|
await conn.execute(text("DELETE FROM dataset_metadata WHERE id = :mid;"), {"mid": metadata_id})
|
||||||
|
print(f"[INFO] Metadata dataset id={metadata_id} dihapus dari tabel dataset_metadata.")
|
||||||
|
|
||||||
|
# 3️⃣ Hapus view terkait di QGIS
|
||||||
|
await conn.execute(text(f"DROP VIEW IF EXISTS {view_name} CASCADE;"))
|
||||||
|
print(f"[INFO] View {view_name} dihapus (jika ada).")
|
||||||
|
|
||||||
|
print("[INFO] ✅ Penghapusan dataset selesai.")
|
||||||
BIN
services/upload_file/.DS_Store
vendored
Normal file
BIN
services/upload_file/.DS_Store
vendored
Normal file
Binary file not shown.
|
|
@ -1,283 +0,0 @@
|
||||||
# import pandas as pd
|
|
||||||
# import re
|
|
||||||
# import csv
|
|
||||||
# import os
|
|
||||||
|
|
||||||
# def detect_header_line(path, max_rows=10):
|
|
||||||
# with open(path, 'r', encoding='utf-8', errors='ignore') as f:
|
|
||||||
# lines = [next(f) for _ in range(max_rows)]
|
|
||||||
|
|
||||||
# header_line_idx = 0
|
|
||||||
# best_score = -1
|
|
||||||
|
|
||||||
# for i, line in enumerate(lines):
|
|
||||||
# cells = re.split(r'[;,|\t]', line.strip())
|
|
||||||
# alpha_ratio = sum(bool(re.search(r'[A-Za-z]', c)) for c in cells) / max(len(cells), 1)
|
|
||||||
# digit_ratio = sum(bool(re.search(r'\d', c)) for c in cells) / max(len(cells), 1)
|
|
||||||
# score = alpha_ratio - digit_ratio
|
|
||||||
|
|
||||||
# if score > best_score:
|
|
||||||
# best_score = score
|
|
||||||
# header_line_idx = i
|
|
||||||
|
|
||||||
# return header_line_idx
|
|
||||||
|
|
||||||
|
|
||||||
# def detect_delimiter(path, sample_size=2048):
|
|
||||||
# with open(path, 'r', encoding='utf-8', errors='ignore') as f:
|
|
||||||
# sample = f.read(sample_size)
|
|
||||||
# sniffer = csv.Sniffer()
|
|
||||||
# try:
|
|
||||||
# dialect = sniffer.sniff(sample)
|
|
||||||
# return dialect.delimiter
|
|
||||||
# except Exception:
|
|
||||||
# for delim in [',', ';', '\t', '|']:
|
|
||||||
# if delim in sample:
|
|
||||||
# return delim
|
|
||||||
# return ','
|
|
||||||
|
|
||||||
|
|
||||||
# def read_csv(path: str):
|
|
||||||
# ext = os.path.splitext(path)[1].lower() # ambil ekstensi file
|
|
||||||
|
|
||||||
# try:
|
|
||||||
# if ext in ['.csv', '.txt']:
|
|
||||||
# # === Baca file CSV ===
|
|
||||||
# header_line = detect_header_line(path)
|
|
||||||
# delimiter = detect_delimiter(path)
|
|
||||||
# print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
|
|
||||||
|
|
||||||
# df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',')
|
|
||||||
|
|
||||||
# elif ext in ['.xlsx', '.xls']:
|
|
||||||
# # === Baca file Excel ===
|
|
||||||
# print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
|
|
||||||
# pre_df = pd.read_excel(path, header=0, dtype=str) # baca semua sebagai string
|
|
||||||
# df = pre_df.copy()
|
|
||||||
# for col in df.columns:
|
|
||||||
# if df[col].str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
|
|
||||||
# df[col] = df[col].str.replace(',', '', regex=False)
|
|
||||||
# df[col] = pd.to_numeric(df[col], errors='ignore')
|
|
||||||
|
|
||||||
# else:
|
|
||||||
# raise ValueError("Format file tidak dikenali (hanya .csv, .txt, .xlsx, .xls)")
|
|
||||||
|
|
||||||
# except Exception as e:
|
|
||||||
# print(f"[WARN] Gagal membaca file ({e}), fallback ke default")
|
|
||||||
# df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
|
|
||||||
|
|
||||||
# # Bersihkan kolom dan baris kosong
|
|
||||||
# df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
|
|
||||||
# df.columns = [str(c).strip() for c in df.columns]
|
|
||||||
# df = df.dropna(how='all')
|
|
||||||
|
|
||||||
# return df
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
import re
|
|
||||||
import csv
|
|
||||||
import os
|
|
||||||
|
|
||||||
def detect_header_line(path, max_rows=10):
|
|
||||||
with open(path, 'r', encoding='utf-8', errors='ignore') as f:
|
|
||||||
lines = [next(f) for _ in range(max_rows)]
|
|
||||||
header_line_idx = 0
|
|
||||||
best_score = -1
|
|
||||||
for i, line in enumerate(lines):
|
|
||||||
cells = re.split(r'[;,|\t]', line.strip())
|
|
||||||
alpha_ratio = sum(bool(re.search(r'[A-Za-z]', c)) for c in cells) / max(len(cells), 1)
|
|
||||||
digit_ratio = sum(bool(re.search(r'\d', c)) for c in cells) / max(len(cells), 1)
|
|
||||||
score = alpha_ratio - digit_ratio
|
|
||||||
if score > best_score:
|
|
||||||
best_score = score
|
|
||||||
header_line_idx = i
|
|
||||||
return header_line_idx
|
|
||||||
|
|
||||||
def detect_delimiter(path, sample_size=2048):
|
|
||||||
with open(path, 'r', encoding='utf-8', errors='ignore') as f:
|
|
||||||
sample = f.read(sample_size)
|
|
||||||
sniffer = csv.Sniffer()
|
|
||||||
try:
|
|
||||||
dialect = sniffer.sniff(sample)
|
|
||||||
return dialect.delimiter
|
|
||||||
except Exception:
|
|
||||||
for delim in [',', ';', '\t', '|']:
|
|
||||||
if delim in sample:
|
|
||||||
return delim
|
|
||||||
return ','
|
|
||||||
|
|
||||||
# def read_csv(path: str):
|
|
||||||
# ext = os.path.splitext(path)[1].lower()
|
|
||||||
|
|
||||||
# try:
|
|
||||||
# if ext in ['.csv']:
|
|
||||||
# # === Baca file CSV ===
|
|
||||||
# header_line = detect_header_line(path)
|
|
||||||
# delimiter = detect_delimiter(path)
|
|
||||||
# print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
|
|
||||||
|
|
||||||
# df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',')
|
|
||||||
|
|
||||||
# elif ext in ['.xlsx', '.xls']:
|
|
||||||
# # === Baca file Excel ===
|
|
||||||
# print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
|
|
||||||
# xls = pd.ExcelFile(path)
|
|
||||||
|
|
||||||
# print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")
|
|
||||||
|
|
||||||
# # Evaluasi tiap sheet untuk mencari yang paling relevan
|
|
||||||
# best_sheet = None
|
|
||||||
# best_score = -1
|
|
||||||
# best_df = None
|
|
||||||
|
|
||||||
# for sheet_name in xls.sheet_names:
|
|
||||||
# try:
|
|
||||||
# df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
|
|
||||||
# df = df.dropna(how='all').dropna(axis=1, how='all')
|
|
||||||
|
|
||||||
# if len(df) == 0 or len(df.columns) < 2:
|
|
||||||
# continue
|
|
||||||
|
|
||||||
# # hitung "skor relevansi"
|
|
||||||
# text_ratio = df.applymap(lambda x: isinstance(x, str)).sum().sum() / (df.size or 1)
|
|
||||||
# row_score = len(df)
|
|
||||||
# score = (row_score * 0.7) + (text_ratio * 100)
|
|
||||||
|
|
||||||
# if score > best_score:
|
|
||||||
# best_score = score
|
|
||||||
# best_sheet = sheet_name
|
|
||||||
# best_df = df
|
|
||||||
|
|
||||||
# except Exception as e:
|
|
||||||
# print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
|
|
||||||
# continue
|
|
||||||
|
|
||||||
# if best_df is not None:
|
|
||||||
# print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
|
|
||||||
# df = best_df
|
|
||||||
# else:
|
|
||||||
# raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
|
|
||||||
|
|
||||||
# # Konversi tipe numerik jika ada
|
|
||||||
# for col in df.columns:
|
|
||||||
# if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
|
|
||||||
# df[col] = df[col].astype(str).str.replace(',', '', regex=False)
|
|
||||||
# df[col] = pd.to_numeric(df[col], errors='ignore')
|
|
||||||
|
|
||||||
# else:
|
|
||||||
# raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)")
|
|
||||||
|
|
||||||
# except Exception as e:
|
|
||||||
# print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
|
|
||||||
# df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
|
|
||||||
|
|
||||||
# # Bersihkan kolom dan baris kosong
|
|
||||||
# df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
|
|
||||||
# df.columns = [str(c).strip() for c in df.columns]
|
|
||||||
# df = df.dropna(how='all')
|
|
||||||
|
|
||||||
# return df
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def read_csv(path: str, sheet: str = None):
|
|
||||||
ext = os.path.splitext(path)[1].lower()
|
|
||||||
|
|
||||||
try:
|
|
||||||
if ext in ['.csv']:
|
|
||||||
# === Baca file CSV ===
|
|
||||||
header_line = detect_header_line(path)
|
|
||||||
delimiter = detect_delimiter(path)
|
|
||||||
print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
|
|
||||||
|
|
||||||
df = pd.read_csv(
|
|
||||||
path,
|
|
||||||
header=header_line,
|
|
||||||
sep=delimiter,
|
|
||||||
encoding='utf-8',
|
|
||||||
low_memory=False,
|
|
||||||
thousands=','
|
|
||||||
)
|
|
||||||
|
|
||||||
elif ext in ['.xlsx', '.xls']:
|
|
||||||
# === Baca file Excel ===
|
|
||||||
print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
|
|
||||||
xls = pd.ExcelFile(path)
|
|
||||||
print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")
|
|
||||||
|
|
||||||
# === Jika user memberikan nama sheet ===
|
|
||||||
if sheet:
|
|
||||||
if sheet not in xls.sheet_names:
|
|
||||||
raise ValueError(f"Sheet '{sheet}' tidak ditemukan dalam file {os.path.basename(path)}")
|
|
||||||
print(f"[INFO] Membaca sheet yang ditentukan: '{sheet}'")
|
|
||||||
df = pd.read_excel(xls, sheet_name=sheet, header=0, dtype=str)
|
|
||||||
df = df.dropna(how='all').dropna(axis=1, how='all')
|
|
||||||
|
|
||||||
else:
|
|
||||||
# === Auto-detect sheet terbaik ===
|
|
||||||
print("[INFO] Tidak ada sheet yang ditentukan, mencari sheet paling relevan...")
|
|
||||||
best_sheet = None
|
|
||||||
best_score = -1
|
|
||||||
best_df = None
|
|
||||||
|
|
||||||
for sheet_name in xls.sheet_names:
|
|
||||||
try:
|
|
||||||
temp_df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
|
|
||||||
temp_df = temp_df.dropna(how='all').dropna(axis=1, how='all')
|
|
||||||
|
|
||||||
if len(temp_df) == 0 or len(temp_df.columns) < 2:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# hitung skor relevansi
|
|
||||||
text_ratio = temp_df.applymap(lambda x: isinstance(x, str)).sum().sum() / (temp_df.size or 1)
|
|
||||||
row_score = len(temp_df)
|
|
||||||
score = (row_score * 0.7) + (text_ratio * 100)
|
|
||||||
|
|
||||||
if score > best_score:
|
|
||||||
best_score = score
|
|
||||||
best_sheet = sheet_name
|
|
||||||
best_df = temp_df
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
if best_df is not None:
|
|
||||||
print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
|
|
||||||
df = best_df
|
|
||||||
else:
|
|
||||||
raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
|
|
||||||
|
|
||||||
# Konversi tipe numerik jika ada
|
|
||||||
for col in df.columns:
|
|
||||||
if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
|
|
||||||
df[col] = df[col].astype(str).str.replace(',', '', regex=False)
|
|
||||||
df[col] = pd.to_numeric(df[col], errors='ignore')
|
|
||||||
|
|
||||||
else:
|
|
||||||
raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
|
|
||||||
df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
|
|
||||||
|
|
||||||
# Bersihkan kolom dan baris kosong
|
|
||||||
df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
|
|
||||||
df.columns = [str(c).strip() for c in df.columns]
|
|
||||||
df = df.dropna(how='all')
|
|
||||||
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
@ -1,47 +0,0 @@
|
||||||
import re
|
|
||||||
import itertools
|
|
||||||
|
|
||||||
geo_admin_keywords = [
|
|
||||||
'lat', 'lon', 'long', 'latitude', 'longitude', 'koordinat', 'geometry', 'geometri',
|
|
||||||
'desa', 'kelurahan', 'kel', 'kecamatan', 'kabupaten', 'kab', 'kota', 'provinsi',
|
|
||||||
'lokasi', 'region', 'area', 'zone', 'boundary', 'batas'
|
|
||||||
]
|
|
||||||
|
|
||||||
def normalize_text(text):
|
|
||||||
text = text.lower()
|
|
||||||
text = re.sub(r'[^a-z0-9/ ]+', ' ', text)
|
|
||||||
text = re.sub(r'\s+', ' ', text).strip()
|
|
||||||
return text
|
|
||||||
|
|
||||||
def generate_combined_patterns(keywords):
|
|
||||||
combos = list(itertools.combinations(keywords, 2))
|
|
||||||
patterns = []
|
|
||||||
for a, b in combos:
|
|
||||||
patterns.append(rf'{a}\s*/\s*{b}')
|
|
||||||
patterns.append(rf'{b}\s*/\s*{a}')
|
|
||||||
return patterns
|
|
||||||
|
|
||||||
combined_patterns = generate_combined_patterns(geo_admin_keywords)
|
|
||||||
|
|
||||||
def contains_geo_admin_keywords(text):
|
|
||||||
text_clean = normalize_text(text)
|
|
||||||
if len(text_clean) < 3:
|
|
||||||
return False
|
|
||||||
|
|
||||||
for pattern in combined_patterns:
|
|
||||||
if re.search(pattern, text_clean):
|
|
||||||
return True
|
|
||||||
|
|
||||||
for kw in geo_admin_keywords:
|
|
||||||
if re.search(rf'(^|[\s/_-]){kw}([\s/_-]|$)', text_clean):
|
|
||||||
return True
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
def filter_geo_admin_column(tables):
|
|
||||||
filtered = []
|
|
||||||
for table in tables:
|
|
||||||
found = any(contains_geo_admin_keywords(col) for col in table['columns'])
|
|
||||||
if found:
|
|
||||||
filtered.append(table)
|
|
||||||
return filtered
|
|
||||||
|
|
@ -1,270 +0,0 @@
|
||||||
import pdfplumber
|
|
||||||
import re
|
|
||||||
import pandas as pd
|
|
||||||
from services.upload_file.read_pdf.filter_column import filter_geo_admin_column
|
|
||||||
|
|
||||||
def is_number(s):
|
|
||||||
if s is None:
|
|
||||||
return False
|
|
||||||
s = str(s).strip().replace(',', '').replace('.', '')
|
|
||||||
return s.isdigit()
|
|
||||||
|
|
||||||
def row_ratio(row):
|
|
||||||
non_empty = [c for c in row if c not in (None, '', ' ')]
|
|
||||||
if not non_empty:
|
|
||||||
return 0
|
|
||||||
num_count = sum(is_number(c) for c in non_empty)
|
|
||||||
return num_count / len(non_empty)
|
|
||||||
|
|
||||||
def has_mixed_text_and_numbers(row):
|
|
||||||
non_empty = [c for c in row if c not in (None, '', ' ')]
|
|
||||||
has_text = any(isinstance(c, str) and re.search(r'[A-Za-z]', str(c)) for c in non_empty)
|
|
||||||
has_num = any(is_number(c) for c in non_empty)
|
|
||||||
return has_text and has_num
|
|
||||||
|
|
||||||
def is_short_text_row(row):
    """Detect a short text-only row (max 2 short text cells), e.g. a caption."""
    cells = [str(c).strip() for c in row if c not in (None, '', ' ')]
    if not cells:
        return False
    all_text = not any(is_number(c) for c in cells)
    return all_text and len(cells) <= 2 and len(" ".join(cells)) < 20
|
|
||||||
|
|
||||||
def detect_header_rows(rows):
    """Split raw extracted table rows into (header_rows, body_rows).

    The body is assumed to start at the first row that mixes text and
    numbers, is mostly numeric (> 30%), contains a pure-digit cell, or
    immediately follows a fully non-numeric row. Short free-text rows
    in the header region are dropped unless the next header row is
    fully non-numeric.

    Args:
        rows: list of rows (each a list of cell values) from a table.

    Returns:
        tuple[list, list]: (header rows, body rows).
    """
    if not rows:
        # Fixed: the original returned a bare [] here, which broke callers
        # that unpack `head, body = detect_header_rows(...)`.
        return [], []

    ratios = [row_ratio(r) for r in rows]
    body_start_index = None

    for i in range(1, len(rows)):
        row = rows[i]
        if has_mixed_text_and_numbers(row):
            body_start_index = i
            break
        if ratios[i] > 0.3:
            body_start_index = i
            break
        if any(isinstance(c, str) and re.match(r'^\d+$', c.strip()) for c in row):
            body_start_index = i
            break
        if ratios[i - 1] == 0 and ratios[i] > 0:
            body_start_index = i
            break

    if body_start_index is None:
        # No body-like row found: treat every row as header.
        body_start_index = len(rows)

    potential_headers = rows[:body_start_index]
    body_filtered = rows[body_start_index:]
    header_filtered = []
    for idx, row in enumerate(potential_headers):
        if is_short_text_row(row):
            # Keep a short text row only when the following header row is
            # fully non-numeric; otherwise it is likely a stray caption.
            if idx + 1 < len(potential_headers) and ratios[idx + 1] == 0:
                header_filtered.append(row)
            else:
                continue
        else:
            header_filtered.append(row)

    return header_filtered, body_filtered
|
|
||||||
|
|
||||||
|
|
||||||
def merge_multiline_header(header_rows):
    """Collapse stacked header rows into a single flat header.

    For each column position the bottom-most non-blank cell wins; embedded
    newlines are flattened to spaces and blank results are dropped.
    """
    merged = []
    for column_cells in zip(*header_rows):
        chosen = ''
        for cell in column_cells:
            if cell and str(cell).strip():
                chosen = cell
        merged.append(str(chosen).replace('\n', ' ').strip())
    return [label for label in merged if label not in ['', None]]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Header labels that mark a row-numbering column ("No", "Nomor", English
# variants, and common OCR confusions like "N0"/"NOM0R"). Matched by
# substring in has_number_header().
NUMBER_HEADER_KEYWORDS = ["no","no.","no .","no . ","no :","no : ","nomor","nomor.","nomor :","nomor urut","no urut","no. urut","no-urut","no_urut","nomor_urut","nomor-urut","No","NO","NO.","No.","No :","NO :","Nomor","NOMOR","Nomor Urut","NOMOR URUT","No Urut","NO URUT","No. Urut","NO. URUT","No /","No / ","No / Nama","No -","No - ","Nomor /","Nomor -","Number","No. of","No of","Index","Serial","Order","ID","ID No","ID No.","Sr No","Sr. No","S/N","SN","Sl No","Sl. No","N0","N0.","N0 :","NOM0R","NOM0R URUT","N0MOR",]
|
|
||||||
|
|
||||||
def has_number_header(header):
    """Check whether the header mentions a No/Nomor-style numbering column.

    Uses `in`, so this is a substring test for a string header and a
    membership test for a list header.
    """
    return any(keyword in header for keyword in NUMBER_HEADER_KEYWORDS)
|
|
||||||
|
|
||||||
def is_numbering_column(col_values):
    """Heuristic: does this column hold sequence numbers like 1, 01, 2, ...?

    Only non-empty string cells are considered; returns True when more than
    60% of them are short (1-3 digit) numbers, optionally zero-padded.
    """
    pattern = re.compile(r"0*\d{1,3}")
    str_values = [v for v in col_values if v and isinstance(v, str)]
    if not str_values:
        return False
    matches = sum(1 for v in str_values if pattern.fullmatch(v.strip()))
    return matches / len(str_values) > 0.6
|
|
||||||
|
|
||||||
def is_numeric_value(v):
    """Return True for int/float values or short digit strings like '01', '123'."""
    if isinstance(v, (int, float)):
        return True
    if isinstance(v, str):
        return re.fullmatch(r"0*\d{1,3}", v.strip()) is not None
    return False
|
|
||||||
|
|
||||||
def cleaning_column(headers, bodies):
    """Normalize table bodies: strip numbering columns and drop ragged rows.

    NOTE(review): call sites pass a single header (the list of column names)
    as *headers* and a one-element list as *bodies*, so zip() pairs the FIRST
    column label with the whole body and `len(headers)` is the expected
    column count. Confirm this convention before refactoring.
    """
    cleaned_bodies = []

    for header, body in zip(headers, bodies):
        if not body:
            # Empty body: pass it through unchanged.
            cleaned_bodies.append(body)
            continue

        # Does the header label look like a "No"/"Nomor" numbering column?
        header_has_number = has_number_header(header)
        first_col = [row[0] for row in body if row and len(row) > 0]
        first_col_is_numbering = is_numbering_column(first_col)

        if not header_has_number and first_col_is_numbering:
            # Body carries a numbering column the header does not declare:
            # strip the leading cell from rows that start with a number.
            new_body = []
            for row in body:
                if not row:
                    continue
                first_val = row[0]
                if is_numeric_value(first_val) and len(row) > 1:
                    new_body.append(row[1:])
                else:
                    new_body.append(row)
            body = new_body

        # Keep only rows whose width matches the expected column count
        # (len(headers) — see NOTE above about the calling convention).
        header_len = len(headers)
        filtered_body = [row for row in body if len(row) == header_len]

        cleaned_bodies.append(filtered_body)

    return cleaned_bodies
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def parse_page_selection(selectedPage: str, total_pages: int):
    """Parse a "1,3-5"-style page selection into a sorted list of page numbers.

    Falsy input selects every page. Tokens may be single numbers or
    "start-end" ranges; malformed tokens are silently skipped, and only
    pages within [1, total_pages] are returned.
    """
    if not selectedPage:
        return list(range(1, total_pages + 1))

    requested = set()
    for token in re.split(r'[,\s]+', selectedPage.strip()):
        if '-' in token:
            try:
                lo, hi = map(int, token.split('-'))
            except ValueError:
                continue
            requested.update(range(lo, hi + 1))
        else:
            try:
                requested.add(int(token))
            except ValueError:
                continue

    return [p for p in sorted(requested) if 1 <= p <= total_pages]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def read_pdf(path: str, page: str):
    """Extract tables from selected pages of a PDF using pdfplumber.

    Pipeline: resolve the page selection (*page*, defaulting to "1" when
    falsy), find tables on each selected page, keep tables longer than two
    rows, split each into header/body rows, merge multi-line headers, clean
    the body columns, then keep only tables whose columns mention geo-admin
    keywords.

    Args:
        path: Filesystem path of the PDF to read.
        page: Page selection string such as "1", "2-4" or "1,3-5".

    Returns:
        list[dict]: One dict per surviving table with keys "title"
        (sequential number as a string), "columns" and "rows".
    """
    pdf_path = path
    selectedPage = None
    # if page == '' or None:
    #     selectedPage = "1"
    if not page:
        selectedPage = "1"
    else:
        selectedPage = page
    tables_data = []
    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)
        selected_pages = parse_page_selection(selectedPage, total_pages)

        print(f"[INFO] Total halaman PDF: {total_pages}")
        print(f"[INFO] Halaman yang dipilih untuk dibaca: {selected_pages}")

        for page_num in selected_pages:
            pdf_page = pdf.pages[page_num - 1]  # pdfplumber pages are 0-indexed
            tables = pdf_page.find_tables()
            print(f"[INFO] Halaman {page_num}: {len(tables)} tabel terdeteksi")

            for t in tables:
                table = t.extract()
                # Tables with two rows or fewer are unlikely to hold a real
                # header + data body, so they are discarded.
                if len(table) > 2:
                    tables_data.append(table)

    print(f"\nTotal tabel valid: {len(tables_data)}\n")

    header_only = []
    body_only = []
    for tbl in tables_data:
        head, body = detect_header_rows(tbl)
        header_only.append(head)
        body_only.append(body)

    clean_header = []
    for h in header_only:
        clean_header.append(merge_multiline_header(h))

    clean_body=[]
    for i, raw_body in enumerate(body_only):
        # Drop empty cells first so row widths can be matched to the header.
        con_body = [[cell for cell in row if cell not in (None, '')] for row in raw_body]
        cleaned = cleaning_column(clean_header[i], [con_body])
        clean_body.append(cleaned[0])

    parsed = []
    for i, (cols, rows) in enumerate(zip(clean_header, clean_body), start=1):
        parsed.append({
            "title": str(i),
            "columns": cols,
            "rows": rows
        })

    clean_parsed = filter_geo_admin_column(parsed)
    # print(f"parsed{clean_parsed}")
    return clean_parsed
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def convert_df(payload):
    """Build a pandas DataFrame from a {'columns', 'rows'[, 'title']} payload.

    Validates that both keys exist, are lists, and that every row has the
    same length as the column list. An optional "title" is stored in
    df.attrs["title"].

    Raises:
        ValueError: missing keys or a row/column length mismatch.
        TypeError: 'columns' or 'rows' is not a list.
    """
    if "columns" not in payload or "rows" not in payload:
        raise ValueError("Payload tidak memiliki key 'columns' atau 'rows'.")

    columns = payload["columns"]
    rows = payload["rows"]
    if not isinstance(columns, list):
        raise TypeError("'columns' harus berupa list.")
    if not isinstance(rows, list):
        raise TypeError("'rows' harus berupa list.")

    expected_width = len(columns)
    for i, row in enumerate(rows):
        if len(row) != expected_width:
            raise ValueError(f"Jumlah elemen di baris ke-{i} tidak sesuai jumlah kolom.")

    df = pd.DataFrame(rows, columns=columns)

    if "title" in payload:
        df.attrs["title"] = payload["title"]

    return df
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def test_read_pdf():
    """Return hard-coded sample PDF-table payloads for manual testing.

    NOTE(review): despite the name this is a fixture/stub, not a pytest
    test — it returns canned data and asserts nothing.
    """
    # single
    # (single-table sample payload — "Potensi Penduduk Terpapar Bencana
    # Banjir di Provinsi Jawa Timur" — removed for brevity; see version
    # control history for the full commented-out data.)

    # double
    parsed = [{"title":"Luas Catchment Area (km2) Pada Wilayah Sungai di Provinsi Jawa Timur","columns":["Wilayah Sungai","Luas (km2)","Jumlah DAS"],"rows":[["Bengawan Solo","13.070,00","94 DAS"],["Brantas","13.880,00","20 DAS"],["Welang -Rejoso","2.601,00","36 DAS"],["Pekalen -Sampean","3.953,00","56 DAS"],["Baru -Bajulmati","3.675,00","60 DAS"],["Bondoyudo -Bedadung","5.364,00","47 DAS"],["Madura","4.575,00","173 DAS"]]},{"title":"Jumlah dan Kepadatan Penduduk Menurut Kabupaten\/kota di Provinsi Jawa Timur Tahun 2021","columns":["Kabupaten\/Kota","Jumlah Penduduk","Persentase","Kepadatan Penduduk (Jiwa per Km2)"],"rows":[["Bangkalan","1.082.759","2,64","1.081,20"],["Banyuwangi","1.749.773","4,27","302,60"],["Blitar","1.228.292","3,00","919,05"],["Bojonegoro","1.343.895","3,28","611,20"],["Bondowoso","801.541","1,96","525,27"],["Gresik","1.283.961","3,13","1.077,83"],["Jember","2.581.486","6,30","834,80"],["Jombang","1.350.483","3,29","1.211,10"],["Kediri","1.671.821","4,08","1.206,18"],["Lamongan","1.379.731","3,37","774,24"],["Lumajang","1.091.856","2,66","609,67"],["Madiun","754.263","1,84","726,94"],["Magetan","689.369","1,68","1.000,77"],["Malang","2.611.907","6,37","739,78"],["Mojokerto","1.126.540","2,75","1.569,37"],["Nganjuk","1.133.556","2,77","925,92"],["Ngawi","896.768","2,19","691,96"],["Pacitan","597.580","1,46","429,94"],["Pamekasan","840.790","2,05","1.061,28"],["Pasuruan","1.603.754","3,91","1.088,01"],["Ponorogo","968.681","2,36","741,89"],["Probolinggo","1.156.570","2,82","681,86"],["Sampang","902.514","2,20","731,92"],["Sidoarjo","1.951.723","4,76","3.076,58"],["Situbondo","666.245","1,63","398,98"],["Sumenep","1.134.750","2,77","567,79"],["Trenggalek","746.734","1,82","650,91"],["Tuban","1.223.257","2,98","666,93"],["Tulungagung","1.126.679","2,75","1.067,28"],["Kota Batu","215.248","0,53","1.574,14"],["Kota Blitar","158.123","0,39","4.854,87"],["Kota Kediri","292.363","0,71","4.611,40"],["Kota Madiun","201.243","0,49","6.045,15"],["Kota Malang","866.356","2,11","5.963,35"],["Kota Mojokerto","139.961","0,34","8.497,94"],["Kota Pasuruan","210.341","0,51","5.960,36"],["Kota Probolinggo","242.246","0,59","4.274,68"],["Kota Surabaya","2.970.843","7,25","8.475,05"],["Provinsi Jawa Timur","40.994.002","100,00","76.228,17"]]}]
    # df = convert_df(parsed, table_index=0)
    return parsed
|
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
116
services/upload_file/readers/reader_csv.py
Normal file
116
services/upload_file/readers/reader_csv.py
Normal file
|
|
@ -0,0 +1,116 @@
|
||||||
|
import pandas as pd
|
||||||
|
import re
|
||||||
|
import csv
|
||||||
|
import os
|
||||||
|
|
||||||
|
def detect_header_line(path, max_rows=10):
    """Guess the 0-based index of the header line within the first *max_rows*.

    Each candidate line is split on common delimiters and scored by the
    share of cells containing letters minus the share containing digits;
    the highest-scoring line wins (headers are mostly text, data rows
    mostly numbers).

    Args:
        path: Text file to inspect.
        max_rows: Number of leading lines to consider.

    Returns:
        int: Index of the most header-like line (0 for an empty file).
    """
    lines = []
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        # Fixed: the original called next(f) unconditionally, which raised
        # StopIteration on files shorter than max_rows lines.
        for _ in range(max_rows):
            line = f.readline()
            if not line:
                break
            lines.append(line)

    header_line_idx = 0
    best_score = -1
    for i, line in enumerate(lines):
        cells = re.split(r'[;,|\t]', line.strip())
        alpha_ratio = sum(bool(re.search(r'[A-Za-z]', c)) for c in cells) / max(len(cells), 1)
        digit_ratio = sum(bool(re.search(r'\d', c)) for c in cells) / max(len(cells), 1)
        score = alpha_ratio - digit_ratio
        if score > best_score:
            best_score = score
            header_line_idx = i
    return header_line_idx
|
||||||
|
|
||||||
|
def detect_delimiter(path, sample_size=2048):
    """Detect the delimiter of a delimited text file from its first bytes.

    Tries csv.Sniffer on a sample of *sample_size* characters; if sniffing
    fails, falls back to the first common delimiter present in the sample,
    defaulting to ','.
    """
    with open(path, 'r', encoding='utf-8', errors='ignore') as fh:
        sample = fh.read(sample_size)

    try:
        return csv.Sniffer().sniff(sample).delimiter
    except Exception:
        # Sniffer could not decide; pick the first common delimiter we see.
        for candidate in (',', ';', '\t', '|'):
            if candidate in sample:
                return candidate
        return ','
|
||||||
|
|
||||||
|
|
||||||
|
def read_csv(path: str, sheet: str = None):
    """Read a tabular file (.csv, .xlsx, .xls) into a cleaned DataFrame.

    For CSV files the header line and delimiter are auto-detected. For
    Excel files either the named *sheet* is read, or the most relevant
    sheet is chosen by a text-ratio/row-count score. Numeric-looking
    columns are coerced, 'Unnamed' columns and all-empty rows dropped.
    On any failure the file is re-read with pandas' default CSV reader.

    Args:
        path: File to read.
        sheet: Optional Excel sheet name.

    Returns:
        pandas.DataFrame: the cleaned table.
    """
    ext = os.path.splitext(path)[1].lower()

    try:
        if ext in ['.csv']:
            header_line = detect_header_line(path)
            delimiter = detect_delimiter(path)
            print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")

            df = pd.read_csv(
                path,
                header=header_line,
                sep=delimiter,
                encoding='utf-8',
                low_memory=False,
                thousands=','
            )

        elif ext in ['.xlsx', '.xls']:
            print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
            xls = pd.ExcelFile(path)
            print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")

            if sheet:
                if sheet not in xls.sheet_names:
                    raise ValueError(f"Sheet '{sheet}' tidak ditemukan dalam file {os.path.basename(path)}")
                print(f"[INFO] Membaca sheet yang ditentukan: '{sheet}'")
                df = pd.read_excel(xls, sheet_name=sheet, header=0, dtype=str)
                df = df.dropna(how='all').dropna(axis=1, how='all')

            else:
                print("[INFO] Tidak ada sheet yang ditentukan, mencari sheet paling relevan...")
                best_sheet = None
                best_score = -1
                best_df = None

                for sheet_name in xls.sheet_names:
                    try:
                        temp_df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
                        temp_df = temp_df.dropna(how='all').dropna(axis=1, how='all')

                        if len(temp_df) == 0 or len(temp_df.columns) < 2:
                            continue

                        # Relevance score: favor sheets with many rows and a
                        # high share of textual cells.
                        text_ratio = temp_df.applymap(lambda x: isinstance(x, str)).sum().sum() / (temp_df.size or 1)
                        row_score = len(temp_df)
                        score = (row_score * 0.7) + (text_ratio * 100)

                        if score > best_score:
                            best_score = score
                            best_sheet = sheet_name
                            best_df = temp_df

                    except Exception as e:
                        print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
                        continue

                if best_df is not None:
                    print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
                    df = best_df
                else:
                    raise ValueError("Tidak ada sheet valid yang dapat dibaca.")

        else:
            # Fixed: this `else` was previously attached to the coercion
            # `for` loop below (a for/else), so the "unknown format" error
            # fired after EVERY successful parse and forced the fallback
            # reader. It now belongs to the extension check, as intended.
            raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)")

        # Coerce columns whose values look like (possibly comma-grouped)
        # numbers into numeric dtype.
        for col in df.columns:
            if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
                df[col] = df[col].astype(str).str.replace(',', '', regex=False)
                # NOTE(review): errors='ignore' is deprecated in recent
                # pandas releases — confirm the installed version and
                # consider errors='coerce' or a try/except instead.
                df[col] = pd.to_numeric(df[col], errors='ignore')

    except Exception as e:
        print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
        df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')

    # Final tidy-up: drop pandas' auto-generated 'Unnamed' columns,
    # strip column names, and remove fully empty rows.
    df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
    df.columns = [str(c).strip() for c in df.columns]
    df = df.dropna(how='all')

    return df
|
||||||
|
|
||||||
168
services/upload_file/readers/reader_pdf.py
Normal file
168
services/upload_file/readers/reader_pdf.py
Normal file
|
|
@ -0,0 +1,168 @@
|
||||||
|
import re
|
||||||
|
import pdfplumber
|
||||||
|
import pandas as pd
|
||||||
|
from services.upload_file.utils.pdf_cleaner import row_ratio, has_mixed_text_and_numbers, is_short_text_row, parse_page_selection, filter_geo_admin_column, cleaning_column
|
||||||
|
from services.upload_file.upload_exceptions import PDFReadError
|
||||||
|
from utils.logger_config import setup_logger
|
||||||
|
|
||||||
|
# Module-level logger configured via the project's logging helper.
logger = setup_logger(__name__)
|
||||||
|
|
||||||
|
def detect_header_rows(rows):
    """Split raw extracted table rows into (header_rows, body_rows).

    The body is assumed to start at the first row that mixes text and
    numbers, is mostly numeric (> 30%), contains a pure-digit cell, or
    immediately follows a fully non-numeric row. Short free-text rows in
    the header region are dropped unless the next header row is fully
    non-numeric.

    Args:
        rows: list of rows (each a list of cell values) from a table.

    Returns:
        tuple[list, list]: (header rows, body rows).
    """
    if not rows:
        # Fixed: the original returned a bare [] here, which broke callers
        # that unpack `head, body = detect_header_rows(...)`.
        return [], []

    ratios = [row_ratio(r) for r in rows]
    body_start_index = None

    for i in range(1, len(rows)):
        row = rows[i]
        if has_mixed_text_and_numbers(row):
            body_start_index = i
            break
        if ratios[i] > 0.3:
            body_start_index = i
            break
        if any(isinstance(c, str) and re.match(r'^\d+$', c.strip()) for c in row):
            body_start_index = i
            break
        if ratios[i - 1] == 0 and ratios[i] > 0:
            body_start_index = i
            break

    if body_start_index is None:
        # No body-like row found: treat every row as header.
        body_start_index = len(rows)

    potential_headers = rows[:body_start_index]
    body_filtered = rows[body_start_index:]
    header_filtered = []
    for idx, row in enumerate(potential_headers):
        if is_short_text_row(row):
            # Keep a short text row only when the following header row is
            # fully non-numeric; otherwise it is likely a stray caption.
            if idx + 1 < len(potential_headers) and ratios[idx + 1] == 0:
                header_filtered.append(row)
            else:
                continue
        else:
            header_filtered.append(row)

    return header_filtered, body_filtered
|
||||||
|
|
||||||
|
|
||||||
|
def merge_multiline_header(header_rows):
    """Flatten stacked header rows into a single header row.

    Column-wise, the last non-blank cell is kept; newlines are replaced
    with spaces and blank entries removed from the result.
    """
    def _pick(cells):
        # Last non-blank cell in the column, or '' when all are blank.
        winner = ''
        for cell in cells:
            if cell and str(cell).strip():
                winner = cell
        return str(winner).replace('\n', ' ').strip()

    collapsed = [_pick(column) for column in zip(*header_rows)]
    return [label for label in collapsed if label not in ['', None]]
|
||||||
|
|
||||||
|
|
||||||
|
def read_pdf(path: str, page: str):
    """
    Semi-automatically read tables from a PDF file using `pdfplumber`.

    Main processing steps:
    1. **Open the PDF file** with pdfplumber.
    2. **Select pages** from the `page` input (e.g. "1,3-5" for pages 1 and 3-5).
    3. **Detect tables** on every selected page.
    4. **Extract raw tables** (list of lists) from each page.
    5. **Split header and body rows** with `detect_header_rows()`.
    6. **Merge multi-line headers** (e.g. tables with two rows of column titles).
    7. **Clean the table body** with `cleaning_column()`:
       - Removes the sequence-number column.
       - Matches the number of columns to the header.
    8. **Assemble the final result** as JSON with the structure:
       {
           "title": <table number>,
           "columns": [...],
           "rows": [...]
       }
    9. **Additional filtering** with `filter_geo_admin_column()` (geospatial metadata).
    10. **Return the result** as a list of JSON dicts ready for the frontend API.

    Args:
        path (str): Location of the PDF file to read.
        page (str): Page number or range, e.g. "1", "2-4", "1,3-5".

    Returns:
        list[dict]: Extracted tables with column and row structure.

    Raises:
        PDFReadError: If reading or parsing the PDF fails.
    """
    try:
        pdf_path = path
        selectedPage = page if page else "1"
        tables_data = []

        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            selected_pages = parse_page_selection(selectedPage, total_pages)

            logger.info(f"[INFO] Total halaman PDF: {total_pages}")
            logger.info(f"[INFO] Halaman yang dipilih untuk dibaca: {selected_pages}")

            for page_num in selected_pages:
                # pdfplumber pages are 0-indexed.
                pdf_page = pdf.pages[page_num - 1]
                tables = pdf_page.find_tables()
                logger.info(f"[INFO] Halaman {page_num}: {len(tables)} tabel terdeteksi")

                for t in tables:
                    table = t.extract()
                    # Tables with two rows or fewer are unlikely to contain
                    # a real header + data body, so they are skipped.
                    if len(table) > 2:
                        tables_data.append(table)

        logger.info(f"\nTotal tabel valid: {len(tables_data)}\n")

        header_only, body_only = [], []
        for tbl in tables_data:
            head, body = detect_header_rows(tbl)
            header_only.append(head)
            body_only.append(body)

        clean_header = [merge_multiline_header(h) for h in header_only]
        clean_body = []

        for i, raw_body in enumerate(body_only):
            # Drop empty cells first so row widths can match the header.
            con_body = [[cell for cell in row if cell not in (None, '')] for row in raw_body]
            cleaned = cleaning_column(clean_header[i], [con_body])
            clean_body.append(cleaned[0])

        parsed = []
        for i, (cols, rows) in enumerate(zip(clean_header, clean_body), start=1):
            parsed.append({
                "title": str(i),
                "columns": cols,
                "rows": rows
            })

        clean_parsed = filter_geo_admin_column(parsed)
        return clean_parsed

    except Exception as e:
        raise PDFReadError(f"Gagal membaca PDF: {e}", code=422)
|
||||||
|
|
||||||
|
|
||||||
|
def convert_df(payload):
    """Build a pandas DataFrame from a parsed-table payload.

    Expects a dict with "columns" (list) and "rows" (list of equal-length
    lists); an optional "title" is stored in df.attrs["title"]. Any
    validation or construction failure is re-raised as PDFReadError
    (code 400).
    """
    try:
        if "columns" not in payload or "rows" not in payload:
            raise ValueError("Payload tidak memiliki key 'columns' atau 'rows'.")

        columns = payload["columns"]
        rows = payload["rows"]
        if not isinstance(columns, list):
            raise TypeError("'columns' harus berupa list.")
        if not isinstance(rows, list):
            raise TypeError("'rows' harus berupa list.")

        expected_width = len(columns)
        for i, row in enumerate(rows):
            if len(row) != expected_width:
                raise ValueError(f"Jumlah elemen di baris ke-{i} tidak sesuai jumlah kolom.")

        df = pd.DataFrame(rows, columns=columns)

        if "title" in payload:
            df.attrs["title"] = payload["title"]

        return df

    except Exception as e:
        raise PDFReadError(f"Gagal konversi payload ke DataFrame: {e}", code=400)
|
||||||
|
|
@ -1,27 +1,30 @@
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import geopandas as gpd
|
import geopandas as gpd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import re
|
||||||
import zipfile
|
import zipfile
|
||||||
from shapely.geometry.base import BaseGeometry
|
from shapely.geometry.base import BaseGeometry
|
||||||
from shapely.geometry import base as shapely_base
|
from shapely.geometry import base as shapely_base
|
||||||
from fastapi import File, Form, UploadFile, HTTPException
|
from fastapi import File, Form, UploadFile, HTTPException
|
||||||
from fastapi.responses import JSONResponse
|
|
||||||
from core.config import UPLOAD_FOLDER, MAX_FILE_MB, VALID_WKT_PREFIXES
|
from core.config import UPLOAD_FOLDER, MAX_FILE_MB, VALID_WKT_PREFIXES
|
||||||
from services.upload_file.read_csv.reader_csv import read_csv
|
from services.upload_file.readers.reader_csv import read_csv
|
||||||
from services.upload_file.read_shp.reader_shp import read_shp
|
from services.upload_file.readers.reader_shp import read_shp
|
||||||
from services.upload_file.read_gdb.reader_gdb import read_gdb
|
from services.upload_file.readers.reader_gdb import read_gdb
|
||||||
from services.upload_file.read_mpk.reader_mpk import read_mpk
|
from services.upload_file.readers.reader_mpk import read_mpk
|
||||||
from services.upload_file.read_pdf.reader_pdf import convert_df, read_pdf
|
from services.upload_file.readers.reader_pdf import convert_df, read_pdf
|
||||||
from services.upload_file.geom_detector.geometry_detector import detect_and_build_geometry
|
from services.upload_file.utils.geometry_detector import detect_and_build_geometry
|
||||||
from services.upload_file.geom_detector.geometry_detector import attach_polygon_geometry_auto
|
from services.upload_file.utils.geometry_detector import attach_polygon_geometry_auto
|
||||||
from database.connection import engine
|
from database.connection import engine
|
||||||
from database.models import Base
|
from database.models import Base
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
from shapely import wkt
|
from shapely import wkt
|
||||||
from sqlalchemy import text
|
from sqlalchemy import text
|
||||||
Base.metadata.create_all(bind=engine)
|
from datetime import datetime
|
||||||
|
from response import successRes, errorRes
|
||||||
|
# Base.metadata.create_all(bind=engine)
|
||||||
|
|
||||||
|
|
||||||
def is_geom_empty(g):
|
def is_geom_empty(g):
|
||||||
|
|
@ -78,7 +81,7 @@ def process_data(df: pd.DataFrame, ext: str):
|
||||||
print(f"[INFO] Tipe Geometry: {geom_type}")
|
print(f"[INFO] Tipe Geometry: {geom_type}")
|
||||||
print(f"[INFO] Jumlah geometry kosong: {null_geom}")
|
print(f"[INFO] Jumlah geometry kosong: {null_geom}")
|
||||||
else:
|
else:
|
||||||
response = {
|
res = {
|
||||||
"message": "Tidak menemukan tabel yang relevan.",
|
"message": "Tidak menemukan tabel yang relevan.",
|
||||||
"file_type": ext,
|
"file_type": ext,
|
||||||
"rows": 0,
|
"rows": 0,
|
||||||
|
|
@ -91,7 +94,7 @@ def process_data(df: pd.DataFrame, ext: str):
|
||||||
"preview": []
|
"preview": []
|
||||||
}
|
}
|
||||||
|
|
||||||
return JSONResponse(content=response)
|
return errorRes(message="Tidak berhasil mencocokan geometry pada tabel." ,details=res, status_code=422)
|
||||||
|
|
||||||
result = result.replace([pd.NA, float('inf'), float('-inf')], None)
|
result = result.replace([pd.NA, float('inf'), float('-inf')], None)
|
||||||
if isinstance(result, gpd.GeoDataFrame) and 'geometry' in result.columns:
|
if isinstance(result, gpd.GeoDataFrame) and 'geometry' in result.columns:
|
||||||
|
|
@ -116,7 +119,8 @@ def process_data(df: pd.DataFrame, ext: str):
|
||||||
else:
|
else:
|
||||||
warning_examples = []
|
warning_examples = []
|
||||||
|
|
||||||
preview_data = result.to_dict(orient="records")
|
preview_data = result.head(10).to_dict(orient="records")
|
||||||
|
# preview_data = result.to_dict(orient="records")
|
||||||
|
|
||||||
preview_safe = [
|
preview_safe = [
|
||||||
{k: safe_json(v) for k, v in row.items()} for row in preview_data
|
{k: safe_json(v) for k, v in row.items()} for row in preview_data
|
||||||
|
|
@ -139,7 +143,7 @@ def process_data(df: pd.DataFrame, ext: str):
|
||||||
"preview": preview_safe
|
"preview": preview_safe
|
||||||
}
|
}
|
||||||
|
|
||||||
# return JSONResponse(content=response)
|
# return successRes(content=response)
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -157,7 +161,7 @@ async def handle_upload_file(file: UploadFile = File(...), page: Optional[str] =
|
||||||
contents = await file.read()
|
contents = await file.read()
|
||||||
size_mb = len(contents) / (1024*1024)
|
size_mb = len(contents) / (1024*1024)
|
||||||
if size_mb > MAX_FILE_MB:
|
if size_mb > MAX_FILE_MB:
|
||||||
raise HTTPException(status_code=413, detail="Ukuran File Terlalu Besar")
|
raise errorRes(status_code=413, message="Ukuran File Terlalu Besar")
|
||||||
tmp_path = UPLOAD_FOLDER / fname
|
tmp_path = UPLOAD_FOLDER / fname
|
||||||
with open(tmp_path, "wb") as f:
|
with open(tmp_path, "wb") as f:
|
||||||
f.write(contents)
|
f.write(contents)
|
||||||
|
|
@ -174,19 +178,19 @@ async def handle_upload_file(file: UploadFile = File(...), page: Optional[str] =
|
||||||
elif ext == ".pdf":
|
elif ext == ".pdf":
|
||||||
tbl = read_pdf(tmp_path, page)
|
tbl = read_pdf(tmp_path, page)
|
||||||
if len(tbl) == 0:
|
if len(tbl) == 0:
|
||||||
response = {
|
res = {
|
||||||
"message": "Tidak ditemukan tabel valid",
|
"message": "Tidak ditemukan tabel valid",
|
||||||
"tables": tbl,
|
"tables": {},
|
||||||
"file_type": ext
|
"file_type": ext
|
||||||
}
|
}
|
||||||
return JSONResponse(content=response)
|
return successRes(message="Tidak ditemukan tabel valid", data=res)
|
||||||
elif len(tbl) > 1:
|
elif len(tbl) > 1:
|
||||||
response = {
|
res = {
|
||||||
"message": "File berhasil dibaca dan dianalisis.",
|
"message": "File berhasil dibaca dan dianalisis.",
|
||||||
"tables": tbl,
|
"tables": tbl,
|
||||||
"file_type": ext
|
"file_type": ext
|
||||||
}
|
}
|
||||||
return JSONResponse(content=response)
|
return successRes(data=res, message="File berhasil dibaca dan dianalisis.")
|
||||||
else:
|
else:
|
||||||
df = convert_df(tbl[0])
|
df = convert_df(tbl[0])
|
||||||
elif ext == ".zip":
|
elif ext == ".zip":
|
||||||
|
|
@ -201,25 +205,29 @@ async def handle_upload_file(file: UploadFile = File(...), page: Optional[str] =
|
||||||
df = read_gdb(str(tmp_path))
|
df = read_gdb(str(tmp_path))
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise HTTPException(
|
raise errorRes(
|
||||||
status_code=400,
|
status_code=400,
|
||||||
detail="ZIP file tidak mengandung SHP atau GDB yang valid."
|
message="ZIP file tidak mengandung SHP atau GDB yang valid."
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
raise HTTPException(status_code=400, detail="Unsupported file type")
|
raise errorRes(status_code=400, message="Unsupported file type")
|
||||||
|
|
||||||
if df is None or (hasattr(df, "empty") and df.empty):
|
if df is None or (hasattr(df, "empty") and df.empty):
|
||||||
return JSONResponse({"error": "No valid table detected"}, status_code=400)
|
return successRes(message="File berhasil dibaca, Tetapi tidak ditemukan tabel valid")
|
||||||
|
|
||||||
res = process_data(df, ext)
|
res = process_data(df, ext)
|
||||||
|
|
||||||
tmp_path.unlink(missing_ok=True)
|
tmp_path.unlink(missing_ok=True)
|
||||||
|
|
||||||
return JSONResponse(content=res)
|
return successRes(data=res)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[ERROR] {e}")
|
print(f"[ERROR] {e}")
|
||||||
return JSONResponse({"error": str(e)}, status_code=500)
|
return errorRes(
|
||||||
|
message="Internal Server Error",
|
||||||
|
details=str(e),
|
||||||
|
status_code=500
|
||||||
|
)
|
||||||
|
|
||||||
# finally:
|
# finally:
|
||||||
# db_session.close()
|
# db_session.close()
|
||||||
|
|
@ -237,16 +245,15 @@ async def handle_process_pdf(payload: PdfRequest):
|
||||||
try:
|
try:
|
||||||
df = convert_df(payload.model_dump())
|
df = convert_df(payload.model_dump())
|
||||||
if df is None or (hasattr(df, "empty") and df.empty):
|
if df is None or (hasattr(df, "empty") and df.empty):
|
||||||
return JSONResponse({"error": "No valid table detected"}, status_code=400)
|
return errorRes(message="Tidak ada tabel")
|
||||||
|
|
||||||
res = process_data(df, '.pdf')
|
res = process_data(df, '.pdf')
|
||||||
|
return successRes(data=res)
|
||||||
return JSONResponse(content=res)
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[ERROR] {e}")
|
print(f"[ERROR] {e}")
|
||||||
|
|
||||||
return JSONResponse({"error": str(e)}, status_code=500)
|
return errorRes(message="Internal Server Error", details= str(e), status_code=500)
|
||||||
|
|
||||||
# finally:
|
# finally:
|
||||||
# db_session.close()
|
# db_session.close()
|
||||||
|
|
@ -263,39 +270,351 @@ class UploadRequest(BaseModel):
|
||||||
rows: List[dict]
|
rows: List[dict]
|
||||||
columns: List[str]
|
columns: List[str]
|
||||||
|
|
||||||
def handle_to_postgis(payload: UploadRequest):
|
# import time
|
||||||
try:
|
# def handle_to_postgis(payload: UploadRequest):
|
||||||
table_name = payload.title.lower().replace(" ", "_").replace("-","_")
|
# # time.sleep(2)
|
||||||
|
# # return {
|
||||||
|
# # "table_name": 'test',
|
||||||
|
# # "status": "success",
|
||||||
|
# # "message": f"Tabel test berhasil diunggah ke PostGIS.",
|
||||||
|
# # "total_rows": 999,
|
||||||
|
# # "geometry_type": 'POINT'
|
||||||
|
# # }
|
||||||
|
# try:
|
||||||
|
# table_name = payload.title.lower().replace(" ", "_").replace("-","_")
|
||||||
|
|
||||||
|
# df = pd.DataFrame(payload.rows)
|
||||||
|
# print(f"[INFO] Diterima {len(df)} baris data dari frontend.")
|
||||||
|
|
||||||
|
# if "geometry" in df.columns:
|
||||||
|
# df["geometry"] = df["geometry"].apply(
|
||||||
|
# lambda g: wkt.loads(g) if isinstance(g, str) and g.strip().upper().startswith(VALID_WKT_PREFIXES) else None
|
||||||
|
# )
|
||||||
|
# gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")
|
||||||
|
# else:
|
||||||
|
# raise HTTPException(status_code=400, detail="Kolom geometry tidak ditemukan dalam data.")
|
||||||
|
|
||||||
|
# with engine.begin() as conn:
|
||||||
|
# conn.execute(text(f"DROP TABLE IF EXISTS {table_name}"))
|
||||||
|
|
||||||
|
# gdf.to_postgis(table_name, engine, if_exists="replace", index=False)
|
||||||
|
|
||||||
|
# with engine.begin() as conn:
|
||||||
|
# conn.execute(text(f'ALTER TABLE "{table_name}" ADD COLUMN _id SERIAL PRIMARY KEY;'))
|
||||||
|
|
||||||
|
# print(f"[INFO] Tabel '{table_name}' berhasil dibuat di PostGIS ({len(gdf)} baris).")
|
||||||
|
|
||||||
|
# return {
|
||||||
|
# "table_name": table_name,
|
||||||
|
# "status": "success",
|
||||||
|
# "message": f"Tabel '{table_name}' berhasil diunggah ke PostGIS.",
|
||||||
|
# "total_rows": len(gdf),
|
||||||
|
# "geometry_type": list(gdf.geom_type.unique())
|
||||||
|
# }
|
||||||
|
|
||||||
|
# except Exception as e:
|
||||||
|
# print(f"[ERROR] Gagal upload ke PostGIS: {e}")
|
||||||
|
# raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# VALID_WKT_PREFIXES = ("POINT", "LINESTRING", "POLYGON", "MULTIPOLYGON", "MULTILINESTRING")
|
||||||
|
|
||||||
|
# def handle_to_postgis(payload: UploadRequest, user_id: int = 2):
|
||||||
|
# try:
|
||||||
|
# table_name = "test_partition"
|
||||||
|
# df = pd.DataFrame(payload.rows)
|
||||||
|
# print(f"[INFO] Diterima {len(df)} baris data dari frontend.")
|
||||||
|
|
||||||
|
# if "geometry" not in df.columns:
|
||||||
|
# raise HTTPException(status_code=400, detail="Kolom geometry tidak ditemukan dalam data.")
|
||||||
|
|
||||||
|
# df["geometry"] = df["geometry"].apply(
|
||||||
|
# lambda g: wkt.loads(g) if isinstance(g, str) and g.strip().upper().startswith(VALID_WKT_PREFIXES) else None
|
||||||
|
# )
|
||||||
|
|
||||||
|
# gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")
|
||||||
|
|
||||||
|
# insert_count = 0
|
||||||
|
# with engine.begin() as conn:
|
||||||
|
|
||||||
|
# # 💡 Tambahkan blok auto-create partisi di sini
|
||||||
|
# conn.execute(text(f"""
|
||||||
|
# DO $$
|
||||||
|
# BEGIN
|
||||||
|
# IF NOT EXISTS (
|
||||||
|
# SELECT 1 FROM pg_tables WHERE tablename = 'test_partition_user_{user_id}'
|
||||||
|
# ) THEN
|
||||||
|
# EXECUTE format('
|
||||||
|
# CREATE TABLE test_partition_user_%s
|
||||||
|
# PARTITION OF test_partition
|
||||||
|
# FOR VALUES IN (%s);
|
||||||
|
# ', {user_id}, {user_id});
|
||||||
|
# EXECUTE format('CREATE INDEX ON test_partition_user_%s USING GIST (geom);', {user_id});
|
||||||
|
# EXECUTE format('CREATE INDEX ON test_partition_user_%s USING GIN (properties);', {user_id});
|
||||||
|
# END IF;
|
||||||
|
# END
|
||||||
|
# $$;
|
||||||
|
# """))
|
||||||
|
|
||||||
|
# # Lanjut insert data seperti biasa
|
||||||
|
# for _, row in gdf.iterrows():
|
||||||
|
# geom_wkt = row["geometry"].wkt if row["geometry"] is not None else None
|
||||||
|
# properties = row.drop(labels=["geometry"]).to_dict()
|
||||||
|
|
||||||
|
# conn.execute(
|
||||||
|
# text("""
|
||||||
|
# INSERT INTO test_partition (user_id, geom, properties, created_at)
|
||||||
|
# VALUES (:user_id, ST_Force2D(ST_GeomFromText(:geom, 4326)), CAST(:properties AS jsonb), :created_at)
|
||||||
|
# """),
|
||||||
|
# {
|
||||||
|
# "user_id": user_id,
|
||||||
|
# "geom": geom_wkt,
|
||||||
|
# "properties": json.dumps(properties),
|
||||||
|
# "created_at": datetime.utcnow()
|
||||||
|
# }
|
||||||
|
# )
|
||||||
|
# insert_count += 1
|
||||||
|
|
||||||
|
# print(f"[INFO] Berhasil memasukkan {insert_count} baris ke partisi user_id={user_id}.")
|
||||||
|
|
||||||
|
# return {
|
||||||
|
# "table_name": table_name,
|
||||||
|
# "user_id": user_id,
|
||||||
|
# "status": "success",
|
||||||
|
# "message": f"Data berhasil ditambahkan ke partisi user_id={user_id}.",
|
||||||
|
# "total_rows": insert_count,
|
||||||
|
# "geometry_type": list(gdf.geom_type.unique())
|
||||||
|
# }
|
||||||
|
|
||||||
|
# except Exception as e:
|
||||||
|
# print(f"[ERROR] Gagal upload ke PostGIS partition: {e}")
|
||||||
|
# raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Daftar prefix WKT yang valid
|
||||||
|
VALID_WKT_PREFIXES = ("POINT", "LINESTRING", "POLYGON", "MULTIPOLYGON", "MULTILINESTRING")
|
||||||
|
|
||||||
|
|
||||||
|
def slugify(value: str) -> str:
|
||||||
|
"""Mengubah judul dataset jadi nama aman untuk VIEW"""
|
||||||
|
return re.sub(r'[^a-zA-Z0-9]+', '_', value.lower()).strip('_')
|
||||||
|
|
||||||
|
|
||||||
|
# async def create_dataset_view_from_metadata(conn, metadata_id: int, user_id: int, title: str):
|
||||||
|
# """Membuat VIEW PostgreSQL berdasarkan metadata dataset dan registrasi geometry untuk QGIS."""
|
||||||
|
# norm_title = slugify(title)
|
||||||
|
# view_name = f"v_user_{user_id}_{norm_title}"
|
||||||
|
# base_table = f"test_partition_user_{user_id}"
|
||||||
|
|
||||||
|
# # 1️⃣ Hapus view lama jika ada
|
||||||
|
# drop_query = text(f"DROP VIEW IF EXISTS {view_name} CASCADE;")
|
||||||
|
# await conn.execute(drop_query)
|
||||||
|
|
||||||
|
# # 2️⃣ Buat view baru
|
||||||
|
# create_view_query = text(f"""
|
||||||
|
# CREATE OR REPLACE VIEW {view_name} AS
|
||||||
|
# SELECT p.*, m.title, m.year, m.description
|
||||||
|
# FROM {base_table} p
|
||||||
|
# JOIN dataset_metadata m ON m.id = p.metadata_id
|
||||||
|
# WHERE p.metadata_id = {metadata_id};
|
||||||
|
# """)
|
||||||
|
# await conn.execute(create_view_query)
|
||||||
|
|
||||||
|
# # 3️⃣ Daftarkan geometry column agar QGIS mengenali layer ini
|
||||||
|
# # (gunakan Populate_Geometry_Columns jika PostGIS >= 3)
|
||||||
|
# populate_query = text(f"SELECT Populate_Geometry_Columns('{view_name}'::regclass);")
|
||||||
|
# await conn.execute(populate_query)
|
||||||
|
|
||||||
|
# print(f"[INFO] VIEW {view_name} berhasil dibuat dan geometry terdaftar.")
|
||||||
|
|
||||||
|
|
||||||
|
async def create_dataset_view_from_metadata(conn, metadata_id: int, user_id: int, title: str):
|
||||||
|
"""Buat VIEW dinamis sesuai struktur atribut JSON pada dataset (hindari duplikasi nama kolom)."""
|
||||||
|
norm_title = slugify(title)
|
||||||
|
view_name = f"v_user_{user_id}_{norm_title}"
|
||||||
|
base_table = f"test_partition_user_{user_id}"
|
||||||
|
|
||||||
|
# Ambil daftar field dari metadata
|
||||||
|
result = await conn.execute(text("SELECT fields FROM dataset_metadata WHERE id=:mid"), {"mid": metadata_id})
|
||||||
|
fields_json = result.scalar_one_or_none()
|
||||||
|
|
||||||
|
# --- daftar kolom bawaan dari tabel utama
|
||||||
|
base_columns = {"id", "user_id", "metadata_id", "geom"}
|
||||||
|
|
||||||
|
columns_sql = ""
|
||||||
|
field_list = []
|
||||||
|
|
||||||
|
if fields_json:
|
||||||
|
try:
|
||||||
|
# handle jika data sudah berupa list atau string JSON
|
||||||
|
if isinstance(fields_json, str):
|
||||||
|
field_list = json.loads(fields_json)
|
||||||
|
elif isinstance(fields_json, list):
|
||||||
|
field_list = fields_json
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Tipe data fields_json tidak dikenali: {type(fields_json)}")
|
||||||
|
|
||||||
|
for f in field_list:
|
||||||
|
safe_col = slugify(f)
|
||||||
|
# Hindari duplikat nama dengan kolom utama
|
||||||
|
if safe_col in base_columns:
|
||||||
|
alias_name = f"attr_{safe_col}"
|
||||||
|
else:
|
||||||
|
alias_name = safe_col
|
||||||
|
|
||||||
|
columns_sql += f", p.attributes->>'{f}' AS {alias_name}"
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[WARN] Gagal parse field list metadata: {e}")
|
||||||
|
|
||||||
|
# 1️⃣ Drop view lama
|
||||||
|
await conn.execute(text(f"DROP VIEW IF EXISTS {view_name} CASCADE;"))
|
||||||
|
|
||||||
|
# 2️⃣ Buat view baru dinamis
|
||||||
|
create_view_query = f"""
|
||||||
|
CREATE OR REPLACE VIEW {view_name} AS
|
||||||
|
SELECT p.id, p.user_id, p.metadata_id, p.geom
|
||||||
|
{columns_sql},
|
||||||
|
m.title, m.year, m.description
|
||||||
|
FROM {base_table} p
|
||||||
|
JOIN dataset_metadata m ON m.id = p.metadata_id
|
||||||
|
WHERE p.metadata_id = {metadata_id};
|
||||||
|
"""
|
||||||
|
await conn.execute(text(create_view_query))
|
||||||
|
|
||||||
|
# 3️⃣ Register geometry untuk QGIS
|
||||||
|
await conn.execute(text(f"SELECT Populate_Geometry_Columns('{view_name}'::regclass);"))
|
||||||
|
|
||||||
|
print(f"[INFO] VIEW {view_name} berhasil dibuat (kolom dinamis: {field_list if field_list else '(none)'}).")
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_to_postgis(payload, engine, user_id: int = 3):
|
||||||
|
"""
|
||||||
|
Menangani upload data spasial ke PostGIS (dengan partition per user).
|
||||||
|
- Jika partisi belum ada, akan dibuat otomatis
|
||||||
|
- Metadata dataset disimpan di tabel dataset_metadata
|
||||||
|
- Data spasial dimasukkan ke tabel partisi (test_partition_user_{id})
|
||||||
|
- VIEW otomatis dibuat untuk QGIS
|
||||||
|
"""
|
||||||
|
|
||||||
|
try:
|
||||||
df = pd.DataFrame(payload.rows)
|
df = pd.DataFrame(payload.rows)
|
||||||
print(f"[INFO] Diterima {len(df)} baris data dari frontend.")
|
print(f"[INFO] Diterima {len(df)} baris data dari frontend.")
|
||||||
|
|
||||||
if "geometry" in df.columns:
|
# --- Validasi kolom geometry ---
|
||||||
df["geometry"] = df["geometry"].apply(
|
if "geometry" not in df.columns:
|
||||||
lambda g: wkt.loads(g) if isinstance(g, str) and g.strip().upper().startswith(VALID_WKT_PREFIXES) else None
|
raise errorRes(status_code=400, message="Kolom 'geometry' tidak ditemukan dalam data.")
|
||||||
|
|
||||||
|
# --- Parsing geometry ke objek shapely ---
|
||||||
|
df["geometry"] = df["geometry"].apply(
|
||||||
|
lambda g: wkt.loads(g)
|
||||||
|
if isinstance(g, str) and g.strip().upper().startswith(VALID_WKT_PREFIXES)
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
|
||||||
|
# --- Buat GeoDataFrame ---
|
||||||
|
gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")
|
||||||
|
|
||||||
|
# --- Metadata info dari payload ---
|
||||||
|
# dataset_title = getattr(payload, "dataset_title", None)
|
||||||
|
# dataset_year = getattr(payload, "dataset_year", None)
|
||||||
|
# dataset_desc = getattr(payload, "dataset_description", None)
|
||||||
|
dataset_title = "longsor 2020"
|
||||||
|
dataset_year = 2020
|
||||||
|
dataset_desc = "test metadata"
|
||||||
|
|
||||||
|
if not dataset_title:
|
||||||
|
raise errorRes(status_code=400, detail="Field 'dataset_title' wajib ada untuk metadata.")
|
||||||
|
|
||||||
|
async with engine.begin() as conn:
|
||||||
|
fields = [col for col in df.columns if col != "geometry"]
|
||||||
|
# 💾 1️⃣ Simpan Metadata Dataset
|
||||||
|
print("[INFO] Menyimpan metadata dataset...")
|
||||||
|
result = await conn.execute(
|
||||||
|
text("""
|
||||||
|
INSERT INTO dataset_metadata (user_id, title, year, description, fields, created_at)
|
||||||
|
VALUES (:user_id, :title, :year, :desc, :fields, :created_at)
|
||||||
|
RETURNING id;
|
||||||
|
"""),
|
||||||
|
{
|
||||||
|
"user_id": user_id,
|
||||||
|
"title": dataset_title,
|
||||||
|
"year": dataset_year,
|
||||||
|
"desc": dataset_desc,
|
||||||
|
"fields": json.dumps(fields),
|
||||||
|
"created_at": datetime.utcnow(),
|
||||||
|
},
|
||||||
)
|
)
|
||||||
gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")
|
metadata_id = result.scalar_one()
|
||||||
else:
|
print(f"[INFO] Metadata disimpan dengan ID {metadata_id}")
|
||||||
raise HTTPException(status_code=400, detail="Kolom geometry tidak ditemukan dalam data.")
|
|
||||||
|
|
||||||
with engine.begin() as conn:
|
# ⚙️ 2️⃣ Auto-create Partisi Jika Belum Ada
|
||||||
conn.execute(text(f"DROP TABLE IF EXISTS {table_name}"))
|
print(f"[INFO] Memastikan partisi test_partition_user_{user_id} tersedia...")
|
||||||
|
await conn.execute(
|
||||||
|
text(f"""
|
||||||
|
DO $$
|
||||||
|
BEGIN
|
||||||
|
IF NOT EXISTS (
|
||||||
|
SELECT 1 FROM pg_tables WHERE tablename = 'test_partition_user_{user_id}'
|
||||||
|
) THEN
|
||||||
|
EXECUTE format('
|
||||||
|
CREATE TABLE test_partition_user_%s
|
||||||
|
PARTITION OF test_partition
|
||||||
|
FOR VALUES IN (%s);
|
||||||
|
', {user_id}, {user_id});
|
||||||
|
EXECUTE format('CREATE INDEX IF NOT EXISTS idx_partition_user_%s_geom ON test_partition_user_%s USING GIST (geom);', {user_id}, {user_id});
|
||||||
|
EXECUTE format('CREATE INDEX IF NOT EXISTS idx_partition_user_%s_metadata ON test_partition_user_%s (metadata_id);', {user_id}, {user_id});
|
||||||
|
END IF;
|
||||||
|
END
|
||||||
|
$$;
|
||||||
|
""")
|
||||||
|
)
|
||||||
|
|
||||||
gdf.to_postgis(table_name, engine, if_exists="replace", index=False)
|
# 🧩 3️⃣ Insert Data Spasial ke Partisi
|
||||||
|
print(f"[INFO] Memasukkan data ke test_partition_user_{user_id} ...")
|
||||||
|
insert_count = 0
|
||||||
|
for _, row in gdf.iterrows():
|
||||||
|
geom_wkt = row["geometry"].wkt if row["geometry"] is not None else None
|
||||||
|
attributes = row.drop(labels=["geometry"]).to_dict()
|
||||||
|
|
||||||
with engine.begin() as conn:
|
await conn.execute(
|
||||||
conn.execute(text(f'ALTER TABLE "{table_name}" ADD COLUMN _id SERIAL PRIMARY KEY;'))
|
text("""
|
||||||
|
INSERT INTO test_partition (user_id, metadata_id, geom, attributes, created_at)
|
||||||
|
VALUES (:user_id, :metadata_id, ST_Force2D(ST_GeomFromText(:geom, 4326)),
|
||||||
|
CAST(:attr AS jsonb), :created_at);
|
||||||
|
"""),
|
||||||
|
{
|
||||||
|
"user_id": user_id,
|
||||||
|
"metadata_id": metadata_id,
|
||||||
|
"geom": geom_wkt,
|
||||||
|
"attr": json.dumps(attributes),
|
||||||
|
"created_at": datetime.utcnow(),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
insert_count += 1
|
||||||
|
|
||||||
print(f"[INFO] Tabel '{table_name}' berhasil dibuat di PostGIS ({len(gdf)} baris).")
|
# 🧩 4️⃣ Membuat VIEW untuk dataset baru di QGIS
|
||||||
|
await create_dataset_view_from_metadata(conn, metadata_id, user_id, dataset_title)
|
||||||
|
|
||||||
|
print(f"[INFO] ✅ Berhasil memasukkan {insert_count} baris ke partisi user_id={user_id} (metadata_id={metadata_id}).")
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"table_name": table_name,
|
|
||||||
"status": "success",
|
"status": "success",
|
||||||
"message": f"Tabel '{table_name}' berhasil diunggah ke PostGIS.",
|
"user_id": user_id,
|
||||||
"total_rows": len(gdf),
|
"metadata_id": metadata_id,
|
||||||
"geometry_type": list(gdf.geom_type.unique())
|
"dataset_title": dataset_title,
|
||||||
|
"inserted_rows": insert_count,
|
||||||
|
"geometry_type": list(gdf.geom_type.unique()),
|
||||||
}
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[ERROR] Gagal upload ke PostGIS: {e}")
|
print(f"[ERROR] Gagal upload ke PostGIS partition: {e}")
|
||||||
raise HTTPException(status_code=500, detail=str(e))
|
raise errorRes(status_code=500, message="Gagal upload ke PostGIS partition", details=str(e))
|
||||||
9
services/upload_file/upload_exceptions.py
Normal file
9
services/upload_file/upload_exceptions.py
Normal file
|
|
@ -0,0 +1,9 @@
|
||||||
|
class PDFReadError(Exception):
|
||||||
|
"""Exception khusus untuk kesalahan saat membaca file PDF."""
|
||||||
|
def __init__(self, message: str, code: int = 400):
|
||||||
|
super().__init__(message)
|
||||||
|
self.message = message
|
||||||
|
self.code = code
|
||||||
|
|
||||||
|
def to_dict(self):
|
||||||
|
return {"error": self.message, "code": self.code}
|
||||||
Binary file not shown.
Binary file not shown.
159
services/upload_file/utils/pdf_cleaner.py
Normal file
159
services/upload_file/utils/pdf_cleaner.py
Normal file
|
|
@ -0,0 +1,159 @@
|
||||||
|
import re
|
||||||
|
import itertools
|
||||||
|
|
||||||
|
geo_admin_keywords = [
|
||||||
|
'lat', 'lon', 'long', 'latitude', 'longitude', 'koordinat', 'geometry', 'geometri',
|
||||||
|
'desa', 'kelurahan', 'kel', 'kecamatan', 'kabupaten', 'kab', 'kota', 'provinsi',
|
||||||
|
'lokasi', 'region', 'area', 'zone', 'boundary', 'batas'
|
||||||
|
]
|
||||||
|
|
||||||
|
def normalize_text(text):
|
||||||
|
text = text.lower()
|
||||||
|
text = re.sub(r'[^a-z0-9/ ]+', ' ', text)
|
||||||
|
text = re.sub(r'\s+', ' ', text).strip()
|
||||||
|
return text
|
||||||
|
|
||||||
|
def generate_combined_patterns(keywords):
|
||||||
|
combos = list(itertools.combinations(keywords, 2))
|
||||||
|
patterns = []
|
||||||
|
for a, b in combos:
|
||||||
|
patterns.append(rf'{a}\s*/\s*{b}')
|
||||||
|
patterns.append(rf'{b}\s*/\s*{a}')
|
||||||
|
return patterns
|
||||||
|
|
||||||
|
combined_patterns = generate_combined_patterns(geo_admin_keywords)
|
||||||
|
|
||||||
|
def contains_geo_admin_keywords(text):
|
||||||
|
text_clean = normalize_text(text)
|
||||||
|
if len(text_clean) < 3:
|
||||||
|
return False
|
||||||
|
|
||||||
|
for pattern in combined_patterns:
|
||||||
|
if re.search(pattern, text_clean):
|
||||||
|
return True
|
||||||
|
|
||||||
|
for kw in geo_admin_keywords:
|
||||||
|
if re.search(rf'(^|[\s/_-]){kw}([\s/_-]|$)', text_clean):
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
def filter_geo_admin_column(tables):
|
||||||
|
filtered = []
|
||||||
|
for table in tables:
|
||||||
|
found = any(contains_geo_admin_keywords(col) for col in table['columns'])
|
||||||
|
if found:
|
||||||
|
filtered.append(table)
|
||||||
|
return filtered
|
||||||
|
|
||||||
|
|
||||||
|
NUMBER_HEADER_KEYWORDS = [
|
||||||
|
"no","no.","nomor","nomor urut","no urut","No","Nomor","No Urut","Index",
|
||||||
|
"ID","Sr No","S/N","SN","Sl No"
|
||||||
|
]
|
||||||
|
|
||||||
|
def has_number_header(header):
|
||||||
|
header_text = header
|
||||||
|
return any(keyword in header_text for keyword in NUMBER_HEADER_KEYWORDS)
|
||||||
|
|
||||||
|
def is_numbering_column(col_values):
|
||||||
|
numeric_like = 0
|
||||||
|
total = 0
|
||||||
|
for v in col_values:
|
||||||
|
if not v or not isinstance(v, str):
|
||||||
|
continue
|
||||||
|
total += 1
|
||||||
|
if re.fullmatch(r"0*\d{1,3}", v.strip()):
|
||||||
|
numeric_like += 1
|
||||||
|
return total > 0 and (numeric_like / total) > 0.6
|
||||||
|
|
||||||
|
def is_numeric_value(v):
|
||||||
|
if v is None:
|
||||||
|
return False
|
||||||
|
if isinstance(v, (int, float)):
|
||||||
|
return True
|
||||||
|
if isinstance(v, str) and re.fullmatch(r"0*\d{1,3}", v.strip()):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def cleaning_column(headers, bodies):
|
||||||
|
cleaned_bodies = []
|
||||||
|
|
||||||
|
for header, body in zip(headers, bodies):
|
||||||
|
if not body:
|
||||||
|
cleaned_bodies.append(body)
|
||||||
|
continue
|
||||||
|
|
||||||
|
header_has_number = has_number_header(header)
|
||||||
|
first_col = [row[0] for row in body if row and len(row) > 0]
|
||||||
|
first_col_is_numbering = is_numbering_column(first_col)
|
||||||
|
|
||||||
|
if not header_has_number and first_col_is_numbering:
|
||||||
|
new_body = []
|
||||||
|
for row in body:
|
||||||
|
if not row:
|
||||||
|
continue
|
||||||
|
first_val = row[0]
|
||||||
|
if is_numeric_value(first_val) and len(row) > 1:
|
||||||
|
new_body.append(row[1:])
|
||||||
|
else:
|
||||||
|
new_body.append(row)
|
||||||
|
body = new_body
|
||||||
|
|
||||||
|
header_len = len(headers)
|
||||||
|
filtered_body = [row for row in body if len(row) == header_len]
|
||||||
|
|
||||||
|
cleaned_bodies.append(filtered_body)
|
||||||
|
|
||||||
|
return cleaned_bodies
|
||||||
|
|
||||||
|
def parse_page_selection(selectedPage: str, total_pages: int):
|
||||||
|
if not selectedPage:
|
||||||
|
return list(range(1, total_pages + 1))
|
||||||
|
|
||||||
|
pages = set()
|
||||||
|
parts = re.split(r'[,\s]+', selectedPage.strip())
|
||||||
|
|
||||||
|
for part in parts:
|
||||||
|
if '-' in part:
|
||||||
|
try:
|
||||||
|
start, end = map(int, part.split('-'))
|
||||||
|
pages.update(range(start, end + 1))
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
pages.add(int(part))
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
|
||||||
|
valid_pages = [p for p in sorted(pages) if 1 <= p <= total_pages]
|
||||||
|
return valid_pages
|
||||||
|
|
||||||
|
def is_number(s):
|
||||||
|
if s is None:
|
||||||
|
return False
|
||||||
|
s = str(s).strip().replace(',', '').replace('.', '')
|
||||||
|
return s.isdigit()
|
||||||
|
|
||||||
|
def row_ratio(row):
|
||||||
|
non_empty = [c for c in row if c not in (None, '', ' ')]
|
||||||
|
if not non_empty:
|
||||||
|
return 0
|
||||||
|
num_count = sum(is_number(c) for c in non_empty)
|
||||||
|
return num_count / len(non_empty)
|
||||||
|
|
||||||
|
def has_mixed_text_and_numbers(row):
|
||||||
|
non_empty = [c for c in row if c not in (None, '', ' ')]
|
||||||
|
has_text = any(isinstance(c, str) and re.search(r'[A-Za-z]', str(c)) for c in non_empty)
|
||||||
|
has_num = any(is_number(c) for c in non_empty)
|
||||||
|
return has_text and has_num
|
||||||
|
|
||||||
|
def is_short_text_row(row):
|
||||||
|
"""Deteksi baris teks pendek (1-2 kolom teks pendek)."""
|
||||||
|
non_empty = [str(c).strip() for c in row if c not in (None, '', ' ')]
|
||||||
|
if not non_empty:
|
||||||
|
return False
|
||||||
|
text_only = all(not is_number(c) for c in non_empty)
|
||||||
|
joined = " ".join(non_empty)
|
||||||
|
return text_only and len(non_empty) <= 2 and len(joined) < 20
|
||||||
32
utils/logger_config.py
Normal file
32
utils/logger_config.py
Normal file
|
|
@ -0,0 +1,32 @@
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
|
LOG_DIR = "logs"
|
||||||
|
os.makedirs(LOG_DIR, exist_ok=True)
|
||||||
|
|
||||||
|
def setup_logger(name: str):
|
||||||
|
"""
|
||||||
|
Konfigurasi logger standar untuk seluruh service.
|
||||||
|
Format log:
|
||||||
|
[LEVEL] [Nama Modul] Pesan
|
||||||
|
"""
|
||||||
|
logger = logging.getLogger(name)
|
||||||
|
logger.setLevel(logging.INFO)
|
||||||
|
|
||||||
|
# Handler untuk menulis ke file
|
||||||
|
file_handler = logging.FileHandler(os.path.join(LOG_DIR, "app.log"))
|
||||||
|
file_handler.setLevel(logging.INFO)
|
||||||
|
|
||||||
|
# Handler untuk console (stdout)
|
||||||
|
console_handler = logging.StreamHandler()
|
||||||
|
console_handler.setLevel(logging.INFO)
|
||||||
|
|
||||||
|
formatter = logging.Formatter('[%(levelname)s] [%(name)s] %(message)s')
|
||||||
|
file_handler.setFormatter(formatter)
|
||||||
|
console_handler.setFormatter(formatter)
|
||||||
|
|
||||||
|
if not logger.handlers:
|
||||||
|
logger.addHandler(file_handler)
|
||||||
|
logger.addHandler(console_handler)
|
||||||
|
|
||||||
|
return logger
|
||||||
30
utils/qgis_init.py
Normal file
30
utils/qgis_init.py
Normal file
|
|
@ -0,0 +1,30 @@
|
||||||
|
# utils/qgis_init.py
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Lokasi instalasi QGIS di Linux (Ubuntu / Debian)
|
||||||
|
QGIS_PREFIX = "/usr"
|
||||||
|
|
||||||
|
# Path modul Python QGIS
|
||||||
|
sys.path.append("/usr/share/qgis/python")
|
||||||
|
|
||||||
|
# Environment variable agar QGIS dapat berjalan headless (tanpa GUI)
|
||||||
|
os.environ["QGIS_PREFIX_PATH"] = QGIS_PREFIX
|
||||||
|
os.environ["QT_QPA_PLATFORM"] = "offscreen"
|
||||||
|
|
||||||
|
from qgis.core import QgsApplication
|
||||||
|
from qgis.analysis import QgsNativeAlgorithms
|
||||||
|
import processing
|
||||||
|
from processing.core.Processing import Processing
|
||||||
|
|
||||||
|
|
||||||
|
def init_qgis():
|
||||||
|
qgs = QgsApplication([], False)
|
||||||
|
qgs.initQgis()
|
||||||
|
|
||||||
|
# Register QGIS processing provider
|
||||||
|
Processing.initialize()
|
||||||
|
QgsApplication.processingRegistry().addProvider(QgsNativeAlgorithms())
|
||||||
|
|
||||||
|
print("QGIS initialized successfully")
|
||||||
|
return qgs
|
||||||
Loading…
Reference in New Issue
Block a user