testing qgis server

DmsAnhr 2025-11-17 10:53:15 +07:00
parent 27ee77331c
commit f4113e7f0d
44 changed files with 1148 additions and 696 deletions



@@ -0,0 +1,34 @@
from fastapi import Depends, Header
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from datetime import datetime
from response import errorRes
from database.connection import SessionLocal
from database.models import User
async def get_db():
async with SessionLocal() as session:
yield session
async def get_current_user(
authorization: str = Header(None),
db: AsyncSession = Depends(get_db)
):
if not authorization or not authorization.startswith("Bearer "):
raise errorRes(status_code=401, message="Missing or invalid token")
token = authorization.split(" ")[1]
result = await db.execute(select(User).where(User.active_token == token))
user = result.scalar_one_or_none()
# Case 1: Token not found → maybe replaced by new login
if not user:
raise errorRes(status_code=401, message="Token invalid or used by another login")
# Case 2: Token expired
if user.token_expired_at and user.token_expired_at < datetime.utcnow():
raise errorRes(status_code=401, message="Token expired")
return user
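
A minimal usage sketch (not part of this commit) of how a router would consume this dependency; the /me route below is hypothetical:

from fastapi import APIRouter, Depends

from api.deps.auth_dependency import get_current_user

router = APIRouter()

@router.get("/me")  # hypothetical route, for illustration only
async def read_profile(user=Depends(get_current_user)):
    # `user` is the ORM User row whose active_token matched the Bearer header
    return {"username": user.username, "role": user.role}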


@@ -0,0 +1,20 @@
from fastapi import Depends, status
from api.deps.auth_dependency import get_current_user
from response import errorRes
def require_role(required_role: str):
"""
Return a dependency function that ensures the current user has a specific role.
Example usage:
@router.get("/admin", dependencies=[Depends(require_role("admin"))])
"""
async def role_checker(user = Depends(get_current_user)):
if user.role != required_role:
raise errorRes(
status_code=status.HTTP_403_FORBIDDEN,
message="Access denied",
details=f"Access denied: requires role '{required_role}'",
)
return user
return role_checker



@@ -0,0 +1,14 @@
from fastapi import APIRouter, Depends
from pydantic import BaseModel
from sqlalchemy.ext.asyncio import AsyncSession
from services.auth.login import loginService, get_db
router = APIRouter()
class LoginRequest(BaseModel):
username: str
password: str
@router.post("/login")
async def login(request: LoginRequest, db: AsyncSession = Depends(get_db)):
return await loginService(request.username, request.password, db)
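
For context, a hedged sketch (not part of the commit) of the round trip this endpoint enables, assuming the service runs on localhost:8000 and a protected /me route exists:

import httpx

BASE = "http://localhost:8000"  # assumed host and port

# 1. Log in; loginService (below) places the token in data.token of the envelope.
resp = httpx.post(f"{BASE}/login", json={"username": "alice", "password": "secret"})
token = resp.json()["data"]["token"]

# 2. Replay the token as a Bearer header; get_current_user matches it against
#    User.active_token on every request until it expires or is replaced.
profile = httpx.get(f"{BASE}/me", headers={"Authorization": f"Bearer {token}"})
print(profile.json())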


@@ -0,0 +1,20 @@
from fastapi import APIRouter
from core.config import engine
from services.datasets.delete import delete_dataset_from_partition  # import the delete helper
from response import successRes, errorRes
router = APIRouter()
@router.delete("/dataset/{user_id}/{metadata_id}")
async def delete_dataset(user_id: int, metadata_id: int, title: str):
"""
Hapus dataset tertentu (berdasarkan user_id dan metadata_id)
"""
try:
async with engine.begin() as conn:
await delete_dataset_from_partition(conn, user_id, metadata_id, title)
return successRes(message=f"Dataset {title} berhasil dihapus.", data="")
except Exception as e:
print(f"[ERROR] Gagal hapus dataset: {e}")
raise errorRes(status_code=500, details=str(e), message="Gagal hapus dataset")


@@ -1,13 +1,15 @@
-from fastapi import APIRouter, File, Form, UploadFile
+from fastapi import APIRouter, File, Form, UploadFile, Depends
from pydantic import BaseModel
from typing import List, Optional
from services.upload_file.upload import handle_upload_file, handle_process_pdf, handle_to_postgis
+from api.deps.role_dependency import require_role
+from database.connection import engine

router = APIRouter()

@router.post("/file")
+# async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form(""), sheet: Optional[str] = Form(""), user = Depends(require_role("admin"))):
async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form(""), sheet: Optional[str] = Form("")):
    return await handle_upload_file(file, page, sheet)
@@ -30,5 +32,5 @@ class UploadRequest(BaseModel):
    columns: List[str]

@router.post("/to-postgis")
-def upload_to_postgis(payload: UploadRequest):
-    return handle_to_postgis(payload)
+async def upload_to_postgis(payload: UploadRequest):
+    return await handle_to_postgis(payload, engine)


@@ -1,6 +1,9 @@
from sqlalchemy import create_engine
+from sqlalchemy.ext.asyncio import create_async_engine
from sqlalchemy.orm import sessionmaker
+from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
from core.config import POSTGIS_URL

-engine = create_engine(POSTGIS_URL, pool_pre_ping=True)
+engine = create_async_engine(POSTGIS_URL, pool_pre_ping=True)
-SessionLocal = sessionmaker(bind=engine)
+# SessionLocal = sessionmaker(bind=engine)
+SessionLocal = async_sessionmaker(engine, expire_on_commit=False)
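
One caveat worth spelling out: create_async_engine requires an asyncio driver in the connection URL. A minimal sketch, assuming asyncpg is installed (the actual POSTGIS_URL lives in core.config and is not shown in this diff):

from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker

# An async engine needs an async driver in the URL scheme; a plain
# "postgresql://" URL would raise InvalidRequestError here.
POSTGIS_URL = "postgresql+asyncpg://user:password@localhost:5432/gisdb"  # placeholder

engine = create_async_engine(POSTGIS_URL, pool_pre_ping=True)
SessionLocal = async_sessionmaker(engine, expire_on_commit=False)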


@@ -1,4 +1,5 @@
-from sqlalchemy import Column, Integer, String, Text, TIMESTAMP
+from sqlalchemy import Column, Integer, String, Text, ForeignKey, DateTime, TIMESTAMP
+from sqlalchemy.orm import relationship
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.sql import func
@@ -14,3 +15,28 @@ class UploadLog(Base):
    uploaded_at = Column(TIMESTAMP, server_default=func.now())
    status = Column(String)
    message = Column(Text)
class Institution(Base):
__tablename__ = "institutions"
id = Column(Integer, primary_key=True, index=True)
name = Column(String(100), unique=True, nullable=False)
address = Column(String(200), nullable=True)
users = relationship("User", back_populates="institution")
class User(Base):
__tablename__ = "users"
id = Column(Integer, primary_key=True, index=True)
username = Column(String(50), unique=True, nullable=False)
password_hash = Column(String(255), nullable=False)
role = Column(String(50), nullable=False, default="user") # <── Added role
institution_id = Column(Integer, ForeignKey("institutions.id"), nullable=True)
active_token = Column(String(255), nullable=True)
token_expired_at = Column(DateTime, nullable=True)
last_login = Column(DateTime, nullable=True)
institution = relationship("Institution", back_populates="users")
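
Because the synchronous Base.metadata.create_all(bind=engine) is commented out elsewhere in this commit (it cannot run against the new async engine), table creation now needs the run_sync bridge. A sketch of the async equivalent, e.g. for a startup hook:

from database.connection import engine
from database.models import Base

async def create_tables():
    # run_sync executes the synchronous DDL helper on the async connection
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)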


@@ -1,16 +0,0 @@
import geopandas as gpd
import pandas as pd
from database.connection import engine
from sqlalchemy import text
def save_dataframe_dynamic(df: pd.DataFrame, table_name: str):
"""Save pandas DataFrame to Postgres (non-geo)."""
df.to_sql(table_name, engine, if_exists="replace", index=False, method='multi', chunksize=1000)
def save_geodataframe(gdf: gpd.GeoDataFrame, table_name: str):
"""Save GeoDataFrame to PostGIS (requires geoalchemy/geopandas)."""
# ensure geometry column exists and CRS set
if gdf.crs is None:
gdf = gdf.set_crs("EPSG:4326", allow_override=True)
# geopandas >= 0.10 has to_postgis in some installs; fallback using SQLAlchemy + GeoAlchemy2:
gdf.to_postgis(table_name, engine, if_exists="replace")


@@ -1,3 +0,0 @@
from database.connection import engine
from database.models import Base
Base.metadata.create_all(bind=engine)

main.py

@@ -3,9 +3,11 @@ from fastapi.middleware.cors import CORSMiddleware
from core.config import API_VERSION, ALLOWED_ORIGINS
from database.connection import engine
from database.models import Base
-from routes.router import router as system_router
+from api.routers.system_router import router as system_router
-from routes.upload_file_router import router as upload_router
+from api.routers.upload_file_router import router as upload_router
-from routes.auth_router import router as auth_router
+from api.routers.auth_router import router as auth_router
+from contextlib import asynccontextmanager
+from utils.qgis_init import init_qgis

app = FastAPI(
    title="ETL Geo Upload Service",
@@ -21,7 +23,35 @@ app.add_middleware(
    allow_headers=["*"],
)

-Base.metadata.create_all(bind=engine)
+# Base.metadata.create_all(bind=engine)
# qgis setup
@asynccontextmanager
async def lifespan(app: FastAPI):
global qgs
qgs = init_qgis()
print("QGIS initialized")
yield
# SHUTDOWN (optional)
print("Shutting down...")
# BUG: `app = FastAPI(lifespan=lifespan)` here would re-create the app and discard
# the title and CORS middleware configured above; pass `lifespan` to the original
# FastAPI(...) constructor instead (see the sketch below).
@app.get("/qgis/status")
def qgis_status():
try:
from qgis.core import Qgis
return {
"qgis_status": "connected",
"qgis_version": Qgis.QGIS_VERSION
}
except Exception as e:
return {
"qgis_status": "error",
"error": str(e)
}
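
Following the note above, a sketch of the usual single-constructor wiring, so the title, CORS middleware, lifespan, and routers all land on one app instance:

from contextlib import asynccontextmanager
from fastapi import FastAPI

from utils.qgis_init import init_qgis

@asynccontextmanager
async def lifespan(app: FastAPI):
    app.state.qgs = init_qgis()  # keep the QGIS handle on app.state instead of a global
    print("QGIS initialized")
    yield
    print("Shutting down...")

app = FastAPI(title="ETL Geo Upload Service", lifespan=lifespan)
# ...then add the CORS middleware and include_router calls on this single instance.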
# Register routers
app.include_router(system_router, tags=["System"])

response.py

@@ -0,0 +1,22 @@
from fastapi import HTTPException
from fastapi.responses import JSONResponse
def successRes(data=None, message="Success", status_code=200):
return JSONResponse(
status_code=status_code,
content={
"status": "success",
"message": message,
"data": data,
}
)
def errorRes(message="Error", status_code=400, details=None):
return HTTPException(
status_code=status_code,
detail={
"status": "error",
"message": message,
"details": details
}
)
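
One subtlety these helpers introduce: successRes builds a JSONResponse and is meant to be returned, while errorRes builds an HTTPException and only takes effect when raised; returning it would hand FastAPI an exception object instead of producing an error status. A minimal illustration (the /items route is hypothetical):

from fastapi import APIRouter
from response import successRes, errorRes

router = APIRouter()

@router.get("/items/{item_id}")
async def get_item(item_id: int):
    if item_id < 0:
        # must be raised: FastAPI turns the HTTPException into a 400 response
        raise errorRes(status_code=400, message="Invalid id")
    # returned directly: it is already a JSONResponse
    return successRes(data={"id": item_id})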


@@ -1,15 +0,0 @@
from fastapi import APIRouter
from core.config import API_VERSION
router = APIRouter()
@router.get("/login")
async def login():
return {"status": "success"}

services/.DS_Store (vendored binary, not shown)


services/auth/login.py

@@ -0,0 +1,49 @@
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from passlib.context import CryptContext
from uuid import uuid4
from datetime import datetime, timedelta
from database.connection import SessionLocal
from database.models import User
from response import successRes, errorRes
async def get_db():
async with SessionLocal() as session:
yield session
pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
async def loginService(username: str, password: str, db: AsyncSession):
result = await db.execute(select(User).where(User.username == username))
user = result.scalar_one_or_none()
if not user:
raise errorRes(status_code=401, message="Invalid username or password")
# Verify password
if not pwd_context.verify(password, user.password_hash):
raise errorRes(status_code=401, message="Invalid username or password")
# Validation for institution user
if user.role != "admin" and not user.institution_id:
raise errorRes(status_code=403, message="User must belong to an institution")
# Generate single active token
token = str(uuid4())
expiry = datetime.utcnow() + timedelta(hours=4)
user.active_token = token
user.token_expired_at = expiry
user.last_login = datetime.utcnow()
await db.commit()
res = {
"status": "success",
"username": user.username,
"role": user.role,
"institution_id": user.institution_id,
"token": token,
"token_expired_at": expiry.isoformat()
}
return successRes(message="Success Login", data=res)
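
For local testing, a hedged sketch (not part of the commit) of seeding a user row that loginService can verify; the credentials are placeholders:

import asyncio

from passlib.context import CryptContext

from database.connection import SessionLocal
from database.models import User

pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")

async def seed_user():
    async with SessionLocal() as session:
        # password_hash must be a bcrypt hash; loginService verifies against it.
        # role="admin" sidesteps the institution_id requirement above.
        session.add(User(username="alice",
                         password_hash=pwd_context.hash("secret"),
                         role="admin"))
        await session.commit()

asyncio.run(seed_user())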


@@ -0,0 +1,33 @@
import re
from sqlalchemy import text
def slugify(value: str):
return re.sub(r'[^a-zA-Z0-9]+', '_', value.lower()).strip('_')
async def delete_dataset_from_partition(conn, user_id: int, metadata_id: int, title: str):
"""
Menghapus dataset tertentu pada partisi user tertentu.
- Hapus semua record di partisi (test_partition_user_{id})
- Hapus metadata dari dataset_metadata
- Hapus VIEW QGIS terkait
"""
base_table = f"test_partition_user_{user_id}"
norm_title = slugify(title)
view_name = f"v_user_{user_id}_{norm_title}"
print(f"[INFO] Menghapus dataset metadata_id={metadata_id} milik user_id={user_id}...")
# 1. Delete the spatial data from the partition
await conn.execute(text(f"DELETE FROM {base_table} WHERE metadata_id = :mid;"), {"mid": metadata_id})
print(f"[INFO] Data spasial dari partisi {base_table} (metadata_id={metadata_id}) dihapus.")
# 2. Delete the metadata row from dataset_metadata
await conn.execute(text("DELETE FROM dataset_metadata WHERE id = :mid;"), {"mid": metadata_id})
print(f"[INFO] Metadata dataset id={metadata_id} dihapus dari tabel dataset_metadata.")
# 3. Drop the related view used by QGIS
await conn.execute(text(f"DROP VIEW IF EXISTS {view_name} CASCADE;"))
print(f"[INFO] View {view_name} dihapus (jika ada).")
print("[INFO] ✅ Penghapusan dataset selesai.")

services/upload_file/.DS_Store (vendored binary, not shown)


@@ -1,283 +0,0 @@
# import pandas as pd
# import re
# import csv
# import os
# def detect_header_line(path, max_rows=10):
# with open(path, 'r', encoding='utf-8', errors='ignore') as f:
# lines = [next(f) for _ in range(max_rows)]
# header_line_idx = 0
# best_score = -1
# for i, line in enumerate(lines):
# cells = re.split(r'[;,|\t]', line.strip())
# alpha_ratio = sum(bool(re.search(r'[A-Za-z]', c)) for c in cells) / max(len(cells), 1)
# digit_ratio = sum(bool(re.search(r'\d', c)) for c in cells) / max(len(cells), 1)
# score = alpha_ratio - digit_ratio
# if score > best_score:
# best_score = score
# header_line_idx = i
# return header_line_idx
# def detect_delimiter(path, sample_size=2048):
# with open(path, 'r', encoding='utf-8', errors='ignore') as f:
# sample = f.read(sample_size)
# sniffer = csv.Sniffer()
# try:
# dialect = sniffer.sniff(sample)
# return dialect.delimiter
# except Exception:
# for delim in [',', ';', '\t', '|']:
# if delim in sample:
# return delim
# return ','
# def read_csv(path: str):
# ext = os.path.splitext(path)[1].lower() # ambil ekstensi file
# try:
# if ext in ['.csv', '.txt']:
# # === Baca file CSV ===
# header_line = detect_header_line(path)
# delimiter = detect_delimiter(path)
# print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
# df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',')
# elif ext in ['.xlsx', '.xls']:
# # === Baca file Excel ===
# print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
# pre_df = pd.read_excel(path, header=0, dtype=str) # baca semua sebagai string
# df = pre_df.copy()
# for col in df.columns:
# if df[col].str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
# df[col] = df[col].str.replace(',', '', regex=False)
# df[col] = pd.to_numeric(df[col], errors='ignore')
# else:
# raise ValueError("Format file tidak dikenali (hanya .csv, .txt, .xlsx, .xls)")
# except Exception as e:
# print(f"[WARN] Gagal membaca file ({e}), fallback ke default")
# df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
# # Bersihkan kolom dan baris kosong
# df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
# df.columns = [str(c).strip() for c in df.columns]
# df = df.dropna(how='all')
# return df
import pandas as pd
import re
import csv
import os
def detect_header_line(path, max_rows=10):
with open(path, 'r', encoding='utf-8', errors='ignore') as f:
lines = [next(f) for _ in range(max_rows)]
header_line_idx = 0
best_score = -1
for i, line in enumerate(lines):
cells = re.split(r'[;,|\t]', line.strip())
alpha_ratio = sum(bool(re.search(r'[A-Za-z]', c)) for c in cells) / max(len(cells), 1)
digit_ratio = sum(bool(re.search(r'\d', c)) for c in cells) / max(len(cells), 1)
score = alpha_ratio - digit_ratio
if score > best_score:
best_score = score
header_line_idx = i
return header_line_idx
def detect_delimiter(path, sample_size=2048):
with open(path, 'r', encoding='utf-8', errors='ignore') as f:
sample = f.read(sample_size)
sniffer = csv.Sniffer()
try:
dialect = sniffer.sniff(sample)
return dialect.delimiter
except Exception:
for delim in [',', ';', '\t', '|']:
if delim in sample:
return delim
return ','
# def read_csv(path: str):
# ext = os.path.splitext(path)[1].lower()
# try:
# if ext in ['.csv']:
# # === Baca file CSV ===
# header_line = detect_header_line(path)
# delimiter = detect_delimiter(path)
# print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
# df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',')
# elif ext in ['.xlsx', '.xls']:
# # === Baca file Excel ===
# print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
# xls = pd.ExcelFile(path)
# print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")
# # Evaluasi tiap sheet untuk mencari yang paling relevan
# best_sheet = None
# best_score = -1
# best_df = None
# for sheet_name in xls.sheet_names:
# try:
# df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
# df = df.dropna(how='all').dropna(axis=1, how='all')
# if len(df) == 0 or len(df.columns) < 2:
# continue
# # hitung "skor relevansi"
# text_ratio = df.applymap(lambda x: isinstance(x, str)).sum().sum() / (df.size or 1)
# row_score = len(df)
# score = (row_score * 0.7) + (text_ratio * 100)
# if score > best_score:
# best_score = score
# best_sheet = sheet_name
# best_df = df
# except Exception as e:
# print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
# continue
# if best_df is not None:
# print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
# df = best_df
# else:
# raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
# # Konversi tipe numerik jika ada
# for col in df.columns:
# if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
# df[col] = df[col].astype(str).str.replace(',', '', regex=False)
# df[col] = pd.to_numeric(df[col], errors='ignore')
# else:
# raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)")
# except Exception as e:
# print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
# df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
# # Bersihkan kolom dan baris kosong
# df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
# df.columns = [str(c).strip() for c in df.columns]
# df = df.dropna(how='all')
# return df
def read_csv(path: str, sheet: str = None):
ext = os.path.splitext(path)[1].lower()
try:
if ext in ['.csv']:
# === Baca file CSV ===
header_line = detect_header_line(path)
delimiter = detect_delimiter(path)
print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
df = pd.read_csv(
path,
header=header_line,
sep=delimiter,
encoding='utf-8',
low_memory=False,
thousands=','
)
elif ext in ['.xlsx', '.xls']:
# === Baca file Excel ===
print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
xls = pd.ExcelFile(path)
print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")
# === Jika user memberikan nama sheet ===
if sheet:
if sheet not in xls.sheet_names:
raise ValueError(f"Sheet '{sheet}' tidak ditemukan dalam file {os.path.basename(path)}")
print(f"[INFO] Membaca sheet yang ditentukan: '{sheet}'")
df = pd.read_excel(xls, sheet_name=sheet, header=0, dtype=str)
df = df.dropna(how='all').dropna(axis=1, how='all')
else:
# === Auto-detect sheet terbaik ===
print("[INFO] Tidak ada sheet yang ditentukan, mencari sheet paling relevan...")
best_sheet = None
best_score = -1
best_df = None
for sheet_name in xls.sheet_names:
try:
temp_df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
temp_df = temp_df.dropna(how='all').dropna(axis=1, how='all')
if len(temp_df) == 0 or len(temp_df.columns) < 2:
continue
# hitung skor relevansi
text_ratio = temp_df.applymap(lambda x: isinstance(x, str)).sum().sum() / (temp_df.size or 1)
row_score = len(temp_df)
score = (row_score * 0.7) + (text_ratio * 100)
if score > best_score:
best_score = score
best_sheet = sheet_name
best_df = temp_df
except Exception as e:
print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
continue
if best_df is not None:
print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
df = best_df
else:
raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
# Konversi tipe numerik jika ada
for col in df.columns:
if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
df[col] = df[col].astype(str).str.replace(',', '', regex=False)
df[col] = pd.to_numeric(df[col], errors='ignore')
else:
raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)")
except Exception as e:
print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
# Bersihkan kolom dan baris kosong
df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
df.columns = [str(c).strip() for c in df.columns]
df = df.dropna(how='all')
return df


@@ -1,47 +0,0 @@
import re
import itertools
geo_admin_keywords = [
'lat', 'lon', 'long', 'latitude', 'longitude', 'koordinat', 'geometry', 'geometri',
'desa', 'kelurahan', 'kel', 'kecamatan', 'kabupaten', 'kab', 'kota', 'provinsi',
'lokasi', 'region', 'area', 'zone', 'boundary', 'batas'
]
def normalize_text(text):
text = text.lower()
text = re.sub(r'[^a-z0-9/ ]+', ' ', text)
text = re.sub(r'\s+', ' ', text).strip()
return text
def generate_combined_patterns(keywords):
combos = list(itertools.combinations(keywords, 2))
patterns = []
for a, b in combos:
patterns.append(rf'{a}\s*/\s*{b}')
patterns.append(rf'{b}\s*/\s*{a}')
return patterns
combined_patterns = generate_combined_patterns(geo_admin_keywords)
def contains_geo_admin_keywords(text):
text_clean = normalize_text(text)
if len(text_clean) < 3:
return False
for pattern in combined_patterns:
if re.search(pattern, text_clean):
return True
for kw in geo_admin_keywords:
if re.search(rf'(^|[\s/_-]){kw}([\s/_-]|$)', text_clean):
return True
return False
def filter_geo_admin_column(tables):
filtered = []
for table in tables:
found = any(contains_geo_admin_keywords(col) for col in table['columns'])
if found:
filtered.append(table)
return filtered


@@ -1,270 +0,0 @@
import pdfplumber
import re
import pandas as pd
from services.upload_file.read_pdf.filter_column import filter_geo_admin_column
def is_number(s):
if s is None:
return False
s = str(s).strip().replace(',', '').replace('.', '')
return s.isdigit()
def row_ratio(row):
non_empty = [c for c in row if c not in (None, '', ' ')]
if not non_empty:
return 0
num_count = sum(is_number(c) for c in non_empty)
return num_count / len(non_empty)
def has_mixed_text_and_numbers(row):
non_empty = [c for c in row if c not in (None, '', ' ')]
has_text = any(isinstance(c, str) and re.search(r'[A-Za-z]', str(c)) for c in non_empty)
has_num = any(is_number(c) for c in non_empty)
return has_text and has_num
def is_short_text_row(row):
"""Deteksi baris teks pendek (1-2 kolom teks pendek)."""
non_empty = [str(c).strip() for c in row if c not in (None, '', ' ')]
if not non_empty:
return False
text_only = all(not is_number(c) for c in non_empty)
joined = " ".join(non_empty)
return text_only and len(non_empty) <= 2 and len(joined) < 20
def detect_header_rows(rows):
if not rows:
return []
ratios = [row_ratio(r) for r in rows]
body_start_index = None
for i in range(1, len(rows)):
row = rows[i]
if has_mixed_text_and_numbers(row):
body_start_index = i
break
if ratios[i] > 0.3:
body_start_index = i
break
if any(isinstance(c, str) and re.match(r'^\d+$', c.strip()) for c in row):
body_start_index = i
break
if ratios[i - 1] == 0 and ratios[i] > 0:
body_start_index = i
break
if body_start_index is None:
body_start_index = len(rows)
potential_headers = rows[:body_start_index]
body_filtered = rows[body_start_index:]
header_filtered = []
for idx, row in enumerate(potential_headers):
if is_short_text_row(row):
if idx + 1 < len(potential_headers) and ratios[idx + 1] == 0:
header_filtered.append(row)
else:
continue
else:
header_filtered.append(row)
return header_filtered, body_filtered
def merge_multiline_header(header_rows):
final_header = []
for col in zip(*header_rows):
val = next((v for v in reversed(col) if v and str(v).strip()), '')
val = str(val).replace('\n', ' ').strip()
final_header.append(val)
final_header = [v for v in final_header if v not in ['', None]]
return final_header
NUMBER_HEADER_KEYWORDS = ["no","no.","no .","no . ","no :","no : ","nomor","nomor.","nomor :","nomor urut","no urut","no. urut","no-urut","no_urut","nomor_urut","nomor-urut","No","NO","NO.","No.","No :","NO :","Nomor","NOMOR","Nomor Urut","NOMOR URUT","No Urut","NO URUT","No. Urut","NO. URUT","No /","No / ","No / Nama","No -","No - ","Nomor /","Nomor -","Number","No. of","No of","Index","Serial","Order","ID","ID No","ID No.","Sr No","Sr. No","S/N","SN","Sl No","Sl. No","N0","N0.","N0 :","NOM0R","NOM0R URUT","N0MOR",]
def has_number_header(header):
"""Periksa apakah header mengandung kolom No/Nomor."""
header_text = header
return any(keyword in header_text for keyword in NUMBER_HEADER_KEYWORDS)
def is_numbering_column(col_values):
"""Periksa apakah kolom pertama diisi nomor urut seperti 1, 01, 2, dst."""
numeric_like = 0
total = 0
for v in col_values:
if not v or not isinstance(v, str):
continue
total += 1
if re.fullmatch(r"0*\d{1,3}", v.strip()):
numeric_like += 1
return total > 0 and (numeric_like / total) > 0.6
def is_numeric_value(v):
"""Cek apakah suatu nilai termasuk angka (int, float, atau string angka)."""
if v is None:
return False
if isinstance(v, (int, float)):
return True
if isinstance(v, str) and re.fullmatch(r"0*\d{1,3}", v.strip()):
return True
return False
def cleaning_column(headers, bodies):
cleaned_bodies = []
for header, body in zip(headers, bodies):
if not body:
cleaned_bodies.append(body)
continue
header_has_number = has_number_header(header)
first_col = [row[0] for row in body if row and len(row) > 0]
first_col_is_numbering = is_numbering_column(first_col)
if not header_has_number and first_col_is_numbering:
new_body = []
for row in body:
if not row:
continue
first_val = row[0]
if is_numeric_value(first_val) and len(row) > 1:
new_body.append(row[1:])
else:
new_body.append(row)
body = new_body
header_len = len(headers)
filtered_body = [row for row in body if len(row) == header_len]
cleaned_bodies.append(filtered_body)
return cleaned_bodies
def parse_page_selection(selectedPage: str, total_pages: int):
if not selectedPage:
return list(range(1, total_pages + 1))
pages = set()
parts = re.split(r'[,\s]+', selectedPage.strip())
for part in parts:
if '-' in part:
try:
start, end = map(int, part.split('-'))
pages.update(range(start, end + 1))
except ValueError:
continue
else:
try:
pages.add(int(part))
except ValueError:
continue
valid_pages = [p for p in sorted(pages) if 1 <= p <= total_pages]
return valid_pages
def read_pdf(path: str, page: str):
pdf_path = path
selectedPage = None
# if page == '' or None:
# selectedPage = "1"
if not page:
selectedPage = "1"
else:
selectedPage = page
tables_data = []
with pdfplumber.open(pdf_path) as pdf:
total_pages = len(pdf.pages)
selected_pages = parse_page_selection(selectedPage, total_pages)
print(f"[INFO] Total halaman PDF: {total_pages}")
print(f"[INFO] Halaman yang dipilih untuk dibaca: {selected_pages}")
for page_num in selected_pages:
pdf_page = pdf.pages[page_num - 1] # index pdfplumber mulai dari 0
tables = pdf_page.find_tables()
print(f"[INFO] Halaman {page_num}: {len(tables)} tabel terdeteksi")
for t in tables:
table = t.extract()
if len(table) > 2:
tables_data.append(table)
print(f"\nTotal tabel valid: {len(tables_data)}\n")
header_only = []
body_only = []
for tbl in tables_data:
head, body = detect_header_rows(tbl)
header_only.append(head)
body_only.append(body)
clean_header = []
for h in header_only:
clean_header.append(merge_multiline_header(h))
clean_body=[]
for i, raw_body in enumerate(body_only):
con_body = [[cell for cell in row if cell not in (None, '')] for row in raw_body]
cleaned = cleaning_column(clean_header[i], [con_body])
clean_body.append(cleaned[0])
parsed = []
for i, (cols, rows) in enumerate(zip(clean_header, clean_body), start=1):
parsed.append({
"title": str(i),
"columns": cols,
"rows": rows
})
clean_parsed = filter_geo_admin_column(parsed)
# print(f"parsed{clean_parsed}")
return clean_parsed
def convert_df(payload):
if "columns" not in payload or "rows" not in payload:
raise ValueError("Payload tidak memiliki key 'columns' atau 'rows'.")
if not isinstance(payload["columns"], list):
raise TypeError("'columns' harus berupa list.")
if not isinstance(payload["rows"], list):
raise TypeError("'rows' harus berupa list.")
for i, row in enumerate(payload["rows"]):
if len(row) != len(payload["columns"]):
raise ValueError(f"Jumlah elemen di baris ke-{i} tidak sesuai jumlah kolom.")
df = pd.DataFrame(payload["rows"], columns=payload["columns"])
if "title" in payload:
df.attrs["title"] = payload["title"]
return df
def test_read_pdf():
# single
# parsed = [{'title': 'Tabel 3.49. Potensi Penduduk Terpapar Bencana Banjir di Provinsi Jawa Timur', 'columns': ['No', 'Kabupaten/Kota', 'Jumlah Penduduk Terpapar (Jiwa)', 'Penduduk Umur Rentan', 'Penduduk Miskin', 'Penduduk Disabilitas', 'Kelas'], 'rows': [['1', 'PACITAN', '111.309', '14.142', '9.307', '781', 'SEDANG'], ['2', 'PONOROGO', '381.579', '50.815', '44.256', '2.346', 'SEDANG'], ['3', 'TRENGGALEK', '284.509', '34.304', '33.653', '1.945', 'SEDANG'], ['4', 'TULUNGAGUNG', '777.174', '86.452', '67.952', '3.200', 'SEDANG'], ['5', 'BLITAR', '226.767', '25.032', '22.554', '909', 'SEDANG'], ['6', 'KEDIRI', '545.961', '59.272', '74.578', '2.539', 'SEDANG'], ['7', 'MALANG', '238.170', '23.646', '25.388', '641', 'SEDANG'], ['8', 'LUMAJANG', '267.926', '30.206', '33.738', '970', 'SEDANG'], ['9', 'JEMBER', '1.061.703', '109.355', '105.958', '2.424', 'SEDANG'], ['10', 'BANYUWANGI', '442.290', '51.294', '44.107', '1.168', 'SEDANG'], ['11', 'BONDOWOSO', '143.452', '18.178', '21.676', '517', 'SEDANG'], ['12', 'SITUBONDO', '233.211', '26.799', '54.221', '928', 'SEDANG'], ['13', 'PROBOLINGGO', '326.005', '37.002', '58.562', '1.323', 'SEDANG'], ['14', 'PASURUAN', '485.143', '49.285', '65.076', '1.576', 'SEDANG'], ['15', 'SIDOARJO', '1.930.615', '172.191', '132.673', '3.987', 'SEDANG'], ['16', 'MOJOKERTO', '498.583', '52.453', '49.831', '1.491', 'SEDANG'], ['17', 'JOMBANG', '876.937', '92.415', '107.447', '4.985', 'SEDANG'], ['18', 'NGANJUK', '829.022', '95.454', '117.127', '3.029', 'SEDANG'], ['19', 'MADIUN', '363.763', '44.997', '44.877', '1.695', 'SEDANG'], ['20', 'MAGETAN', '117.247', '15.706', '11.051', '652', 'SEDANG'], ['21', 'NGAWI', '419.065', '49.864', '65.877', '1.572', 'SEDANG'], ['22', 'BOJONEGORO', '910.377', '100.800', '117.977', '3.557', 'SEDANG'], ['23', 'TUBAN', '507.407', '51.775', '60.834', '2.206', 'SEDANG'], ['24', 'LAMONGAN', '884.503', '99.928', '96.031', '3.960', 'SEDANG'], ['25', 'GRESIK', '613.133', '59.848', '49.854', '1.666', 'SEDANG'], ['26', 'BANGKALAN', '312.149', '31.075', '36.099', '1.169', 'SEDANG'], ['27', 'SAMPANG', '239.656', '28.756', '39.790', '1.280', 'SEDANG'], ['28', 'PAMEKASAN', '216.423', '25.831', '30.296', '776', 'SEDANG'], ['29', 'SUMENEP', '217.805', '24.741', '33.293', '1.088', 'SEDANG'], ['1', 'KOTA KEDIRI', '162.064', '17.129', '13.997', '363', 'SEDANG'], ['2', 'KOTA BLITAR', '21.390', '2.242', '1.185', '79', 'SEDANG'], ['3', 'KOTA MALANG', '148.072', '15.499', '6.142', '201', 'SEDANG'], ['4', 'KOTA PROBOLINGGO', '117.911', '12.708', '10.913', '420', 'SEDANG'], ['5', 'KOTA PASURUAN', '199.602', '20.199', '19.721', '516', 'SEDANG'], ['6', 'KOTA MOJOKERTO', '139.962', '14.486', '6.971', '584', 'SEDANG'], ['7', 'KOTA MADIUN', '149.468', '17.255', '6.300', '304', 'SEDANG'], ['8', 'KOTA SURABAYA', '2.469.639', '244.061', '133.953', '3.838', 'SEDANG'], ['9', 'KOTA BATU', '8.858', '939', '529', '13', 'SEDANG'], ['-', 'Provinsi Jawa Timur', '17.878.850', '1.906.134', '1.853.794', '60.698', 'SEDANG']]}]
# double
parsed = [{"title":"Luas Catchment Area (km2) Pada Wilayah Sungai di Provinsi Jawa Timur","columns":["Wilayah Sungai","Luas (km2)","Jumlah DAS"],"rows":[["Bengawan Solo","13.070,00","94 DAS"],["Brantas","13.880,00","20 DAS"],["Welang -Rejoso","2.601,00","36 DAS"],["Pekalen -Sampean","3.953,00","56 DAS"],["Baru -Bajulmati","3.675,00","60 DAS"],["Bondoyudo -Bedadung","5.364,00","47 DAS"],["Madura","4.575,00","173 DAS"]]},{"title":"Jumlah dan Kepadatan Penduduk Menurut Kabupaten\/kota di Provinsi Jawa Timur Tahun 2021","columns":["Kabupaten\/Kota","Jumlah Penduduk","Persentase","Kepadatan Penduduk (Jiwa per Km2)"],"rows":[["Bangkalan","1.082.759","2,64","1.081,20"],["Banyuwangi","1.749.773","4,27","302,60"],["Blitar","1.228.292","3,00","919,05"],["Bojonegoro","1.343.895","3,28","611,20"],["Bondowoso","801.541","1,96","525,27"],["Gresik","1.283.961","3,13","1.077,83"],["Jember","2.581.486","6,30","834,80"],["Jombang","1.350.483","3,29","1.211,10"],["Kediri","1.671.821","4,08","1.206,18"],["Lamongan","1.379.731","3,37","774,24"],["Lumajang","1.091.856","2,66","609,67"],["Madiun","754.263","1,84","726,94"],["Magetan","689.369","1,68","1.000,77"],["Malang","2.611.907","6,37","739,78"],["Mojokerto","1.126.540","2,75","1.569,37"],["Nganjuk","1.133.556","2,77","925,92"],["Ngawi","896.768","2,19","691,96"],["Pacitan","597.580","1,46","429,94"],["Pamekasan","840.790","2,05","1.061,28"],["Pasuruan","1.603.754","3,91","1.088,01"],["Ponorogo","968.681","2,36","741,89"],["Probolinggo","1.156.570","2,82","681,86"],["Sampang","902.514","2,20","731,92"],["Sidoarjo","1.951.723","4,76","3.076,58"],["Situbondo","666.245","1,63","398,98"],["Sumenep","1.134.750","2,77","567,79"],["Trenggalek","746.734","1,82","650,91"],["Tuban","1.223.257","2,98","666,93"],["Tulungagung","1.126.679","2,75","1.067,28"],["Kota Batu","215.248","0,53","1.574,14"],["Kota Blitar","158.123","0,39","4.854,87"],["Kota Kediri","292.363","0,71","4.611,40"],["Kota Madiun","201.243","0,49","6.045,15"],["Kota Malang","866.356","2,11","5.963,35"],["Kota Mojokerto","139.961","0,34","8.497,94"],["Kota Pasuruan","210.341","0,51","5.960,36"],["Kota Probolinggo","242.246","0,59","4.274,68"],["Kota Surabaya","2.970.843","7,25","8.475,05"],["Provinsi Jawa Timur","40.994.002","100,00","76.228,17"]]}]
# df = convert_df(parsed, table_index=0)
return parsed


@@ -0,0 +1,116 @@
import pandas as pd
import re
import csv
import os
def detect_header_line(path, max_rows=10):
with open(path, 'r', encoding='utf-8', errors='ignore') as f:
lines = [next(f) for _ in range(max_rows)]
header_line_idx = 0
best_score = -1
for i, line in enumerate(lines):
cells = re.split(r'[;,|\t]', line.strip())
alpha_ratio = sum(bool(re.search(r'[A-Za-z]', c)) for c in cells) / max(len(cells), 1)
digit_ratio = sum(bool(re.search(r'\d', c)) for c in cells) / max(len(cells), 1)
score = alpha_ratio - digit_ratio
if score > best_score:
best_score = score
header_line_idx = i
return header_line_idx
def detect_delimiter(path, sample_size=2048):
with open(path, 'r', encoding='utf-8', errors='ignore') as f:
sample = f.read(sample_size)
sniffer = csv.Sniffer()
try:
dialect = sniffer.sniff(sample)
return dialect.delimiter
except Exception:
for delim in [',', ';', '\t', '|']:
if delim in sample:
return delim
return ','
def read_csv(path: str, sheet: str = None):
ext = os.path.splitext(path)[1].lower()
try:
if ext in ['.csv']:
header_line = detect_header_line(path)
delimiter = detect_delimiter(path)
print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
df = pd.read_csv(
path,
header=header_line,
sep=delimiter,
encoding='utf-8',
low_memory=False,
thousands=','
)
elif ext in ['.xlsx', '.xls']:
print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
xls = pd.ExcelFile(path)
print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")
if sheet:
if sheet not in xls.sheet_names:
raise ValueError(f"Sheet '{sheet}' tidak ditemukan dalam file {os.path.basename(path)}")
print(f"[INFO] Membaca sheet yang ditentukan: '{sheet}'")
df = pd.read_excel(xls, sheet_name=sheet, header=0, dtype=str)
df = df.dropna(how='all').dropna(axis=1, how='all')
else:
print("[INFO] Tidak ada sheet yang ditentukan, mencari sheet paling relevan...")
best_sheet = None
best_score = -1
best_df = None
for sheet_name in xls.sheet_names:
try:
temp_df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
temp_df = temp_df.dropna(how='all').dropna(axis=1, how='all')
if len(temp_df) == 0 or len(temp_df.columns) < 2:
continue
# compute a relevance score
text_ratio = temp_df.applymap(lambda x: isinstance(x, str)).sum().sum() / (temp_df.size or 1)
row_score = len(temp_df)
score = (row_score * 0.7) + (text_ratio * 100)
if score > best_score:
best_score = score
best_sheet = sheet_name
best_df = temp_df
except Exception as e:
print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
continue
if best_df is not None:
print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
df = best_df
else:
raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
for col in df.columns:
if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
df[col] = df[col].astype(str).str.replace(',', '', regex=False)
df[col] = pd.to_numeric(df[col], errors='ignore')
else:
raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)")
except Exception as e:
print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
df.columns = [str(c).strip() for c in df.columns]
df = df.dropna(how='all')
return df
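
A short usage sketch of the reader above (paths are placeholders): a CSV goes through delimiter and header detection, while an Excel file either reads the named sheet or auto-picks the highest-scoring one:

df = read_csv("data/population.csv")             # delimiter + header auto-detected
df = read_csv("data/report.xlsx", sheet="2021")  # explicit sheet name
df = read_csv("data/report.xlsx")                # best sheet by row-count/text-ratio score
print(df.head())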


@@ -0,0 +1,168 @@
import re
import pdfplumber
import pandas as pd
from services.upload_file.utils.pdf_cleaner import row_ratio, has_mixed_text_and_numbers, is_short_text_row, parse_page_selection, filter_geo_admin_column, cleaning_column
from services.upload_file.upload_exceptions import PDFReadError
from utils.logger_config import setup_logger
logger = setup_logger(__name__)
def detect_header_rows(rows):
if not rows:
return [], []
ratios = [row_ratio(r) for r in rows]
body_start_index = None
for i in range(1, len(rows)):
row = rows[i]
if has_mixed_text_and_numbers(row):
body_start_index = i
break
if ratios[i] > 0.3:
body_start_index = i
break
if any(isinstance(c, str) and re.match(r'^\d+$', c.strip()) for c in row):
body_start_index = i
break
if ratios[i - 1] == 0 and ratios[i] > 0:
body_start_index = i
break
if body_start_index is None:
body_start_index = len(rows)
potential_headers = rows[:body_start_index]
body_filtered = rows[body_start_index:]
header_filtered = []
for idx, row in enumerate(potential_headers):
if is_short_text_row(row):
if idx + 1 < len(potential_headers) and ratios[idx + 1] == 0:
header_filtered.append(row)
else:
continue
else:
header_filtered.append(row)
return header_filtered, body_filtered
def merge_multiline_header(header_rows):
final_header = []
for col in zip(*header_rows):
val = next((v for v in reversed(col) if v and str(v).strip()), '')
val = str(val).replace('\n', ' ').strip()
final_header.append(val)
final_header = [v for v in final_header if v not in ['', None]]
return final_header
def read_pdf(path: str, page: str):
"""
Membaca tabel dari file PDF secara semi-otomatis menggunakan `pdfplumber`.
Alur utama proses:
1. **Buka file PDF** menggunakan pdfplumber.
2. **Pilih halaman** berdasarkan input `page` (misalnya "1,3-5" untuk halaman 1 dan 35).
3. **Deteksi tabel** di setiap halaman yang dipilih.
4. **Ekstraksi tabel mentah** (list of list) dari setiap halaman.
5. **Pisahkan baris header dan body** dengan fungsi `detect_header_rows()`.
6. **Gabungkan header multi-baris** (misalnya tabel dengan dua baris judul kolom).
7. **Bersihkan body tabel** menggunakan `cleaning_column()`:
- Menghapus kolom nomor urut.
- Menyesuaikan jumlah kolom dengan header.
8. **Gabungkan hasil akhir** ke dalam format JSON dengan struktur:
{
"title": <nomor tabel>,
"columns": [...],
"rows": [...]
}
9. **Filter tambahan** dengan `filter_geo_admin_column()` (khusus metadata geospasial).
10. **Kembalikan hasil** berupa list JSON siap dikirim ke frontend API.
Args:
path (str): Lokasi file PDF yang akan dibaca.
page (str): Nomor halaman atau rentang halaman, contoh: "1", "2-4", "1,3-5".
Returns:
list[dict]: Daftar tabel hasil ekstraksi dengan struktur kolom dan baris.
Raises:
PDFReadError: Jika terjadi kesalahan saat membaca atau parsing PDF.
"""
try:
pdf_path = path
selectedPage = page if page else "1"
tables_data = []
with pdfplumber.open(pdf_path) as pdf:
total_pages = len(pdf.pages)
selected_pages = parse_page_selection(selectedPage, total_pages)
logger.info(f"[INFO] Total halaman PDF: {total_pages}")
logger.info(f"[INFO] Halaman yang dipilih untuk dibaca: {selected_pages}")
for page_num in selected_pages:
pdf_page = pdf.pages[page_num - 1]
tables = pdf_page.find_tables()
logger.info(f"[INFO] Halaman {page_num}: {len(tables)} tabel terdeteksi")
for t in tables:
table = t.extract()
if len(table) > 2:
tables_data.append(table)
logger.info(f"\nTotal tabel valid: {len(tables_data)}\n")
header_only, body_only = [], []
for tbl in tables_data:
head, body = detect_header_rows(tbl)
header_only.append(head)
body_only.append(body)
clean_header = [merge_multiline_header(h) for h in header_only]
clean_body = []
for i, raw_body in enumerate(body_only):
con_body = [[cell for cell in row if cell not in (None, '')] for row in raw_body]
cleaned = cleaning_column(clean_header[i], [con_body])
clean_body.append(cleaned[0])
parsed = []
for i, (cols, rows) in enumerate(zip(clean_header, clean_body), start=1):
parsed.append({
"title": str(i),
"columns": cols,
"rows": rows
})
clean_parsed = filter_geo_admin_column(parsed)
return clean_parsed
except Exception as e:
raise PDFReadError(f"Gagal membaca PDF: {e}", code=422)
def convert_df(payload):
try:
if "columns" not in payload or "rows" not in payload:
raise ValueError("Payload tidak memiliki key 'columns' atau 'rows'.")
if not isinstance(payload["columns"], list):
raise TypeError("'columns' harus berupa list.")
if not isinstance(payload["rows"], list):
raise TypeError("'rows' harus berupa list.")
for i, row in enumerate(payload["rows"]):
if len(row) != len(payload["columns"]):
raise ValueError(f"Jumlah elemen di baris ke-{i} tidak sesuai jumlah kolom.")
df = pd.DataFrame(payload["rows"], columns=payload["columns"])
if "title" in payload:
df.attrs["title"] = payload["title"]
return df
except Exception as e:
raise PDFReadError(f"Gagal konversi payload ke DataFrame: {e}", code=400)
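
A matching usage sketch for the PDF reader (the path is a placeholder); the page argument accepts single pages, ranges, and comma-separated mixes, and only tables that pass the geo-admin column filter are returned:

tables = read_pdf("data/report.pdf", "1,3-5")  # pages 1, 3, 4 and 5
for t in tables:
    df = convert_df(t)  # dict -> DataFrame, validating the columns/rows shape
    print(df.attrs.get("title"), df.shape)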


@@ -1,27 +1,30 @@
+import json
import os
import pandas as pd
import geopandas as gpd
import numpy as np
+import re
import zipfile
from shapely.geometry.base import BaseGeometry
from shapely.geometry import base as shapely_base
from fastapi import File, Form, UploadFile, HTTPException
-from fastapi.responses import JSONResponse
from core.config import UPLOAD_FOLDER, MAX_FILE_MB, VALID_WKT_PREFIXES
-from services.upload_file.read_csv.reader_csv import read_csv
+from services.upload_file.readers.reader_csv import read_csv
-from services.upload_file.read_shp.reader_shp import read_shp
+from services.upload_file.readers.reader_shp import read_shp
-from services.upload_file.read_gdb.reader_gdb import read_gdb
+from services.upload_file.readers.reader_gdb import read_gdb
-from services.upload_file.read_mpk.reader_mpk import read_mpk
+from services.upload_file.readers.reader_mpk import read_mpk
-from services.upload_file.read_pdf.reader_pdf import convert_df, read_pdf
+from services.upload_file.readers.reader_pdf import convert_df, read_pdf
-from services.upload_file.geom_detector.geometry_detector import detect_and_build_geometry
+from services.upload_file.utils.geometry_detector import detect_and_build_geometry
-from services.upload_file.geom_detector.geometry_detector import attach_polygon_geometry_auto
+from services.upload_file.utils.geometry_detector import attach_polygon_geometry_auto
from database.connection import engine
from database.models import Base
from pydantic import BaseModel
from typing import List, Optional
from shapely import wkt
from sqlalchemy import text
-Base.metadata.create_all(bind=engine)
+from datetime import datetime
+from response import successRes, errorRes
+# Base.metadata.create_all(bind=engine)

def is_geom_empty(g):
@@ -78,7 +81,7 @@ def process_data(df: pd.DataFrame, ext: str):
        print(f"[INFO] Tipe Geometry: {geom_type}")
        print(f"[INFO] Jumlah geometry kosong: {null_geom}")
    else:
-        response = {
+        res = {
            "message": "Tidak menemukan tabel yang relevan.",
            "file_type": ext,
            "rows": 0,
@@ -91,7 +94,7 @@ def process_data(df: pd.DataFrame, ext: str):
            "preview": []
        }
-        return JSONResponse(content=response)
+        return errorRes(message="Tidak berhasil mencocokan geometry pada tabel.", details=res, status_code=422)

    result = result.replace([pd.NA, float('inf'), float('-inf')], None)
    if isinstance(result, gpd.GeoDataFrame) and 'geometry' in result.columns:
@@ -116,7 +119,8 @@ def process_data(df: pd.DataFrame, ext: str):
    else:
        warning_examples = []

-    preview_data = result.to_dict(orient="records")
+    preview_data = result.head(10).to_dict(orient="records")
+    # preview_data = result.to_dict(orient="records")
    preview_safe = [
        {k: safe_json(v) for k, v in row.items()} for row in preview_data
@@ -139,7 +143,7 @@ def process_data(df: pd.DataFrame, ext: str):
        "preview": preview_safe
    }
-    # return JSONResponse(content=response)
+    # return successRes(content=response)
    return response
@@ -157,7 +161,7 @@ async def handle_upload_file(file: UploadFile = File(...), page: Optional[str] =
        contents = await file.read()
        size_mb = len(contents) / (1024*1024)
        if size_mb > MAX_FILE_MB:
-            raise HTTPException(status_code=413, detail="Ukuran File Terlalu Besar")
+            raise errorRes(status_code=413, message="Ukuran File Terlalu Besar")
        tmp_path = UPLOAD_FOLDER / fname
        with open(tmp_path, "wb") as f:
            f.write(contents)
@@ -174,19 +178,19 @@ async def handle_upload_file(file: UploadFile = File(...), page: Optional[str] =
        elif ext == ".pdf":
            tbl = read_pdf(tmp_path, page)
            if len(tbl) == 0:
-                response = {
+                res = {
                    "message": "Tidak ditemukan tabel valid",
-                    "tables": tbl,
+                    "tables": {},
                    "file_type": ext
                }
-                return JSONResponse(content=response)
+                return successRes(message="Tidak ditemukan tabel valid", data=res)
            elif len(tbl) > 1:
-                response = {
+                res = {
                    "message": "File berhasil dibaca dan dianalisis.",
                    "tables": tbl,
                    "file_type": ext
                }
-                return JSONResponse(content=response)
+                return successRes(data=res, message="File berhasil dibaca dan dianalisis.")
            else:
                df = convert_df(tbl[0])
        elif ext == ".zip":
@@ -201,25 +205,29 @@ async def handle_upload_file(file: UploadFile = File(...), page: Optional[str] =
                df = read_gdb(str(tmp_path))
            else:
-                raise HTTPException(
+                raise errorRes(
                    status_code=400,
-                    detail="ZIP file tidak mengandung SHP atau GDB yang valid."
+                    message="ZIP file tidak mengandung SHP atau GDB yang valid."
                )
        else:
-            raise HTTPException(status_code=400, detail="Unsupported file type")
+            raise errorRes(status_code=400, message="Unsupported file type")

        if df is None or (hasattr(df, "empty") and df.empty):
-            return JSONResponse({"error": "No valid table detected"}, status_code=400)
+            return successRes(message="File berhasil dibaca, Tetapi tidak ditemukan tabel valid")

        res = process_data(df, ext)
        tmp_path.unlink(missing_ok=True)
-        return JSONResponse(content=res)
+        return successRes(data=res)
    except Exception as e:
        print(f"[ERROR] {e}")
-        return JSONResponse({"error": str(e)}, status_code=500)
+        return errorRes(
+            message="Internal Server Error",
+            details=str(e),
+            status_code=500
+        )
    # finally:
    #     db_session.close()
@@ -237,16 +245,15 @@ async def handle_process_pdf(payload: PdfRequest):
    try:
        df = convert_df(payload.model_dump())
        if df is None or (hasattr(df, "empty") and df.empty):
-            return JSONResponse({"error": "No valid table detected"}, status_code=400)
+            return errorRes(message="Tidak ada tabel")
        res = process_data(df, '.pdf')
+        return successRes(data=res)
-        return JSONResponse(content=res)
    except Exception as e:
        print(f"[ERROR] {e}")
-        return JSONResponse({"error": str(e)}, status_code=500)
+        return errorRes(message="Internal Server Error", details=str(e), status_code=500)
    # finally:
    #     db_session.close()
@@ -263,39 +270,351 @@ class UploadRequest(BaseModel):
    rows: List[dict]
    columns: List[str]
-def handle_to_postgis(payload: UploadRequest):
-    try:
-        table_name = payload.title.lower().replace(" ", "_").replace("-","_")
+# import time
+# def handle_to_postgis(payload: UploadRequest):
+#     # time.sleep(2)
# # return {
# # "table_name": 'test',
# # "status": "success",
# # "message": f"Tabel test berhasil diunggah ke PostGIS.",
# # "total_rows": 999,
# # "geometry_type": 'POINT'
# # }
# try:
# table_name = payload.title.lower().replace(" ", "_").replace("-","_")
# df = pd.DataFrame(payload.rows)
# print(f"[INFO] Diterima {len(df)} baris data dari frontend.")
# if "geometry" in df.columns:
# df["geometry"] = df["geometry"].apply(
# lambda g: wkt.loads(g) if isinstance(g, str) and g.strip().upper().startswith(VALID_WKT_PREFIXES) else None
# )
# gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")
# else:
# raise HTTPException(status_code=400, detail="Kolom geometry tidak ditemukan dalam data.")
# with engine.begin() as conn:
# conn.execute(text(f"DROP TABLE IF EXISTS {table_name}"))
# gdf.to_postgis(table_name, engine, if_exists="replace", index=False)
# with engine.begin() as conn:
# conn.execute(text(f'ALTER TABLE "{table_name}" ADD COLUMN _id SERIAL PRIMARY KEY;'))
# print(f"[INFO] Tabel '{table_name}' berhasil dibuat di PostGIS ({len(gdf)} baris).")
# return {
# "table_name": table_name,
# "status": "success",
# "message": f"Tabel '{table_name}' berhasil diunggah ke PostGIS.",
# "total_rows": len(gdf),
# "geometry_type": list(gdf.geom_type.unique())
# }
# except Exception as e:
# print(f"[ERROR] Gagal upload ke PostGIS: {e}")
# raise HTTPException(status_code=500, detail=str(e))
# VALID_WKT_PREFIXES = ("POINT", "LINESTRING", "POLYGON", "MULTIPOLYGON", "MULTILINESTRING")
# def handle_to_postgis(payload: UploadRequest, user_id: int = 2):
# try:
# table_name = "test_partition"
# df = pd.DataFrame(payload.rows)
# print(f"[INFO] Diterima {len(df)} baris data dari frontend.")
# if "geometry" not in df.columns:
# raise HTTPException(status_code=400, detail="Kolom geometry tidak ditemukan dalam data.")
# df["geometry"] = df["geometry"].apply(
# lambda g: wkt.loads(g) if isinstance(g, str) and g.strip().upper().startswith(VALID_WKT_PREFIXES) else None
# )
# gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")
# insert_count = 0
# with engine.begin() as conn:
# # 💡 Tambahkan blok auto-create partisi di sini
# conn.execute(text(f"""
# DO $$
# BEGIN
# IF NOT EXISTS (
# SELECT 1 FROM pg_tables WHERE tablename = 'test_partition_user_{user_id}'
# ) THEN
# EXECUTE format('
# CREATE TABLE test_partition_user_%s
# PARTITION OF test_partition
# FOR VALUES IN (%s);
# ', {user_id}, {user_id});
# EXECUTE format('CREATE INDEX ON test_partition_user_%s USING GIST (geom);', {user_id});
# EXECUTE format('CREATE INDEX ON test_partition_user_%s USING GIN (properties);', {user_id});
# END IF;
# END
# $$;
# """))
# # Lanjut insert data seperti biasa
# for _, row in gdf.iterrows():
# geom_wkt = row["geometry"].wkt if row["geometry"] is not None else None
# properties = row.drop(labels=["geometry"]).to_dict()
# conn.execute(
# text("""
# INSERT INTO test_partition (user_id, geom, properties, created_at)
# VALUES (:user_id, ST_Force2D(ST_GeomFromText(:geom, 4326)), CAST(:properties AS jsonb), :created_at)
# """),
# {
# "user_id": user_id,
# "geom": geom_wkt,
# "properties": json.dumps(properties),
# "created_at": datetime.utcnow()
# }
# )
# insert_count += 1
# print(f"[INFO] Berhasil memasukkan {insert_count} baris ke partisi user_id={user_id}.")
# return {
# "table_name": table_name,
# "user_id": user_id,
# "status": "success",
# "message": f"Data berhasil ditambahkan ke partisi user_id={user_id}.",
# "total_rows": insert_count,
# "geometry_type": list(gdf.geom_type.unique())
# }
# except Exception as e:
# print(f"[ERROR] Gagal upload ke PostGIS partition: {e}")
# raise HTTPException(status_code=500, detail=str(e))
# List of valid WKT prefixes
VALID_WKT_PREFIXES = ("POINT", "LINESTRING", "POLYGON", "MULTIPOLYGON", "MULTILINESTRING")
def slugify(value: str) -> str:
"""Mengubah judul dataset jadi nama aman untuk VIEW"""
return re.sub(r'[^a-zA-Z0-9]+', '_', value.lower()).strip('_')
# async def create_dataset_view_from_metadata(conn, metadata_id: int, user_id: int, title: str):
# """Membuat VIEW PostgreSQL berdasarkan metadata dataset dan registrasi geometry untuk QGIS."""
# norm_title = slugify(title)
# view_name = f"v_user_{user_id}_{norm_title}"
# base_table = f"test_partition_user_{user_id}"
# # 1⃣ Hapus view lama jika ada
# drop_query = text(f"DROP VIEW IF EXISTS {view_name} CASCADE;")
# await conn.execute(drop_query)
# # 2⃣ Buat view baru
# create_view_query = text(f"""
# CREATE OR REPLACE VIEW {view_name} AS
# SELECT p.*, m.title, m.year, m.description
# FROM {base_table} p
# JOIN dataset_metadata m ON m.id = p.metadata_id
# WHERE p.metadata_id = {metadata_id};
# """)
# await conn.execute(create_view_query)
# # 3⃣ Daftarkan geometry column agar QGIS mengenali layer ini
# # (gunakan Populate_Geometry_Columns jika PostGIS >= 3)
# populate_query = text(f"SELECT Populate_Geometry_Columns('{view_name}'::regclass);")
# await conn.execute(populate_query)
# print(f"[INFO] VIEW {view_name} berhasil dibuat dan geometry terdaftar.")
async def create_dataset_view_from_metadata(conn, metadata_id: int, user_id: int, title: str):
"""Buat VIEW dinamis sesuai struktur atribut JSON pada dataset (hindari duplikasi nama kolom)."""
norm_title = slugify(title)
view_name = f"v_user_{user_id}_{norm_title}"
base_table = f"test_partition_user_{user_id}"
# Fetch the field list from the metadata row
result = await conn.execute(text("SELECT fields FROM dataset_metadata WHERE id=:mid"), {"mid": metadata_id})
fields_json = result.scalar_one_or_none()
# --- built-in columns of the base table
base_columns = {"id", "user_id", "metadata_id", "geom"}
columns_sql = ""
field_list = []
if fields_json:
try:
# handle fields stored either as a list or as a JSON string
if isinstance(fields_json, str):
field_list = json.loads(fields_json)
elif isinstance(fields_json, list):
field_list = fields_json
else:
raise ValueError(f"Tipe data fields_json tidak dikenali: {type(fields_json)}")
for f in field_list:
safe_col = slugify(f)
# Avoid clashing with the base column names
if safe_col in base_columns:
alias_name = f"attr_{safe_col}"
else:
alias_name = safe_col
columns_sql += f", p.attributes->>'{f}' AS {alias_name}"
except Exception as e:
print(f"[WARN] Gagal parse field list metadata: {e}")
# 1. Drop the old view
await conn.execute(text(f"DROP VIEW IF EXISTS {view_name} CASCADE;"))
# 2. Create the new dynamic view
create_view_query = f"""
CREATE OR REPLACE VIEW {view_name} AS
SELECT p.id, p.user_id, p.metadata_id, p.geom
{columns_sql},
m.title, m.year, m.description
FROM {base_table} p
JOIN dataset_metadata m ON m.id = p.metadata_id
WHERE p.metadata_id = {metadata_id};
"""
await conn.execute(text(create_view_query))
# 3⃣ Register geometry untuk QGIS
await conn.execute(text(f"SELECT Populate_Geometry_Columns('{view_name}'::regclass);"))
print(f"[INFO] VIEW {view_name} berhasil dibuat (kolom dinamis: {field_list if field_list else '(none)'}).")
async def handle_to_postgis(payload, engine, user_id: int = 3):
    """
    Handle uploading spatial data to PostGIS (partitioned per user).
    - If the user's partition does not exist yet, it is created automatically
    - Dataset metadata is stored in the dataset_metadata table
    - Spatial rows are inserted into the partition table (test_partition_user_{id})
    - A VIEW is created automatically for QGIS
    """
    try:
        df = pd.DataFrame(payload.rows)
        print(f"[INFO] Received {len(df)} rows from the frontend.")

        # --- Validate the geometry column ---
        if "geometry" not in df.columns:
            raise errorRes(status_code=400, message="Column 'geometry' not found in the data.")

        # --- Parse geometry strings into shapely objects ---
        df["geometry"] = df["geometry"].apply(
            lambda g: wkt.loads(g)
            if isinstance(g, str) and g.strip().upper().startswith(VALID_WKT_PREFIXES)
            else None
        )

        # --- Build the GeoDataFrame ---
        gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")

        # --- Dataset metadata (hardcoded for testing; payload fields left commented out) ---
        # dataset_title = getattr(payload, "dataset_title", None)
        # dataset_year = getattr(payload, "dataset_year", None)
        # dataset_desc = getattr(payload, "dataset_description", None)
        dataset_title = "longsor 2020"
        dataset_year = 2020
        dataset_desc = "test metadata"

        if not dataset_title:
            raise errorRes(status_code=400, message="Field 'dataset_title' is required for metadata.")

        async with engine.begin() as conn:
            fields = [col for col in df.columns if col != "geometry"]

            # 1) Save the dataset metadata
            print("[INFO] Saving dataset metadata...")
            result = await conn.execute(
                text("""
                    INSERT INTO dataset_metadata (user_id, title, year, description, fields, created_at)
                    VALUES (:user_id, :title, :year, :desc, :fields, :created_at)
                    RETURNING id;
                """),
                {
                    "user_id": user_id,
                    "title": dataset_title,
                    "year": dataset_year,
                    "desc": dataset_desc,
                    "fields": json.dumps(fields),
                    "created_at": datetime.utcnow(),
                },
            )
            metadata_id = result.scalar_one()
            print(f"[INFO] Metadata saved with ID {metadata_id}")

            # 2) Auto-create the partition if it does not exist yet
            print(f"[INFO] Ensuring partition test_partition_user_{user_id} exists...")
            await conn.execute(
                text(f"""
                    DO $$
                    BEGIN
                        IF NOT EXISTS (
                            SELECT 1 FROM pg_tables WHERE tablename = 'test_partition_user_{user_id}'
                        ) THEN
                            EXECUTE format('
                                CREATE TABLE test_partition_user_%s
                                PARTITION OF test_partition
                                FOR VALUES IN (%s);
                            ', {user_id}, {user_id});
                            EXECUTE format('CREATE INDEX IF NOT EXISTS idx_partition_user_%s_geom ON test_partition_user_%s USING GIST (geom);', {user_id}, {user_id});
                            EXECUTE format('CREATE INDEX IF NOT EXISTS idx_partition_user_%s_metadata ON test_partition_user_%s (metadata_id);', {user_id}, {user_id});
                        END IF;
                    END
                    $$;
                """)
            )

            # 3) Insert the spatial rows into the partition
            print(f"[INFO] Inserting data into test_partition_user_{user_id} ...")
            insert_count = 0
            for _, row in gdf.iterrows():
                geom_wkt = row["geometry"].wkt if row["geometry"] is not None else None
                attributes = row.drop(labels=["geometry"]).to_dict()
                await conn.execute(
                    text("""
                        INSERT INTO test_partition (user_id, metadata_id, geom, attributes, created_at)
                        VALUES (:user_id, :metadata_id, ST_Force2D(ST_GeomFromText(:geom, 4326)),
                                CAST(:attr AS jsonb), :created_at);
                    """),
                    {
                        "user_id": user_id,
                        "metadata_id": metadata_id,
                        "geom": geom_wkt,
                        "attr": json.dumps(attributes),
                        "created_at": datetime.utcnow(),
                    },
                )
                insert_count += 1

            # 4) Create the VIEW for the new dataset so QGIS can see it
            await create_dataset_view_from_metadata(conn, metadata_id, user_id, dataset_title)

        print(f"[INFO] Successfully inserted {insert_count} rows into partition user_id={user_id} (metadata_id={metadata_id}).")
        return {
            "status": "success",
            "user_id": user_id,
            "metadata_id": metadata_id,
            "dataset_title": dataset_title,
            "inserted_rows": insert_count,
            "geometry_type": list(gdf.geom_type.unique()),
        }
    except Exception as e:
        print(f"[ERROR] Failed to upload to PostGIS partition: {e}")
        raise errorRes(status_code=500, message="Failed to upload to PostGIS partition", details=str(e))
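# Minimal usage sketch (assumptions: an async SQLAlchemy engine from core.config and
# a payload object exposing a `rows` attribute, as the route layer provides):
#
#   from core.config import engine
#
#   class _Payload:
#       rows = [
#           {"geometry": "POINT (107.6 -6.9)", "kecamatan": "Coblong"},
#           {"geometry": "POINT (107.7 -6.8)", "kecamatan": "Cidadap"},
#       ]
#
#   result = await handle_to_postgis(_Payload(), engine, user_id=3)
#   # -> {"status": "success", "inserted_rows": 2, ...}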

View File

@ -0,0 +1,9 @@
class PDFReadError(Exception):
    """Custom exception for errors raised while reading a PDF file."""
    def __init__(self, message: str, code: int = 400):
        super().__init__(message)
        self.message = message
        self.code = code

    def to_dict(self):
        return {"error": self.message, "code": self.code}

View File

@ -0,0 +1,159 @@
import re
import itertools

# Keywords that indicate geographic/administrative columns (Indonesian admin levels included)
geo_admin_keywords = [
    'lat', 'lon', 'long', 'latitude', 'longitude', 'koordinat', 'geometry', 'geometri',
    'desa', 'kelurahan', 'kel', 'kecamatan', 'kabupaten', 'kab', 'kota', 'provinsi',
    'lokasi', 'region', 'area', 'zone', 'boundary', 'batas'
]

def normalize_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9/ ]+', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def generate_combined_patterns(keywords):
    # Build patterns such as "desa/kelurahan" for every keyword pair, in both orders
    combos = list(itertools.combinations(keywords, 2))
    patterns = []
    for a, b in combos:
        patterns.append(rf'{a}\s*/\s*{b}')
        patterns.append(rf'{b}\s*/\s*{a}')
    return patterns

combined_patterns = generate_combined_patterns(geo_admin_keywords)

def contains_geo_admin_keywords(text):
    text_clean = normalize_text(text)
    if len(text_clean) < 3:
        return False
    for pattern in combined_patterns:
        if re.search(pattern, text_clean):
            return True
    for kw in geo_admin_keywords:
        if re.search(rf'(^|[\s/_-]){kw}([\s/_-]|$)', text_clean):
            return True
    return False

def filter_geo_admin_column(tables):
    # Keep only tables that have at least one geo/admin column
    filtered = []
    for table in tables:
        found = any(contains_geo_admin_keywords(col) for col in table['columns'])
        if found:
            filtered.append(table)
    return filtered
NUMBER_HEADER_KEYWORDS = [
    "no", "no.", "nomor", "nomor urut", "no urut", "No", "Nomor", "No Urut", "Index",
    "ID", "Sr No", "S/N", "SN", "Sl No"
]

def has_number_header(header):
    # True if the header contains one of the known numbering labels
    return any(keyword in header for keyword in NUMBER_HEADER_KEYWORDS)

def is_numbering_column(col_values):
    # A column counts as a numbering column if >60% of its values look like small integers
    numeric_like = 0
    total = 0
    for v in col_values:
        if not v or not isinstance(v, str):
            continue
        total += 1
        if re.fullmatch(r"0*\d{1,3}", v.strip()):
            numeric_like += 1
    return total > 0 and (numeric_like / total) > 0.6

def is_numeric_value(v):
    if v is None:
        return False
    if isinstance(v, (int, float)):
        return True
    if isinstance(v, str) and re.fullmatch(r"0*\d{1,3}", v.strip()):
        return True
    return False
def cleaning_column(headers, bodies):
    cleaned_bodies = []
    for header, body in zip(headers, bodies):
        if not body:
            cleaned_bodies.append(body)
            continue
        header_has_number = has_number_header(header)
        first_col = [row[0] for row in body if row and len(row) > 0]
        first_col_is_numbering = is_numbering_column(first_col)
        # Drop an unlabeled leading numbering column (row numbers without a "No" header)
        if not header_has_number and first_col_is_numbering:
            new_body = []
            for row in body:
                if not row:
                    continue
                first_val = row[0]
                if is_numeric_value(first_val) and len(row) > 1:
                    new_body.append(row[1:])
                else:
                    new_body.append(row)
            body = new_body
        # Keep only rows whose length matches this table's header
        # (was len(headers), which compared against the number of tables, not columns)
        header_len = len(header)
        filtered_body = [row for row in body if len(row) == header_len]
        cleaned_bodies.append(filtered_body)
    return cleaned_bodies
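# Illustrative behaviour (with the len(header) fix above):
#
#   headers = [["Nama", "Luas"]]
#   bodies  = [[["1", "Coblong", "7.4"], ["2", "Cidadap", "6.1"]]]
#   cleaning_column(headers, bodies)
#   # -> [[["Coblong", "7.4"], ["Cidadap", "6.1"]]]
#
# The unlabeled leading numbering column is stripped, then rows whose length no
# longer matches the header are dropped.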
def parse_page_selection(selectedPage: str, total_pages: int):
    if not selectedPage:
        return list(range(1, total_pages + 1))
    pages = set()
    parts = re.split(r'[,\s]+', selectedPage.strip())
    for part in parts:
        if '-' in part:
            try:
                start, end = map(int, part.split('-'))
                pages.update(range(start, end + 1))
            except ValueError:
                continue
        else:
            try:
                pages.add(int(part))
            except ValueError:
                continue
    valid_pages = [p for p in sorted(pages) if 1 <= p <= total_pages]
    return valid_pages
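# Illustrative behaviour (not part of the commit):
#
#   parse_page_selection("1,3-5 8", total_pages=10) -> [1, 3, 4, 5, 8]
#   parse_page_selection("9-12", total_pages=10)    -> [9, 10]    (out-of-range pages dropped)
#   parse_page_selection("", total_pages=3)         -> [1, 2, 3]  (empty selection = all pages)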
def is_number(s):
    if s is None:
        return False
    s = str(s).strip().replace(',', '').replace('.', '')
    return s.isdigit()

def row_ratio(row):
    # Fraction of non-empty cells in the row that are numeric
    non_empty = [c for c in row if c not in (None, '', ' ')]
    if not non_empty:
        return 0
    num_count = sum(is_number(c) for c in non_empty)
    return num_count / len(non_empty)

def has_mixed_text_and_numbers(row):
    non_empty = [c for c in row if c not in (None, '', ' ')]
    has_text = any(isinstance(c, str) and re.search(r'[A-Za-z]', str(c)) for c in non_empty)
    has_num = any(is_number(c) for c in non_empty)
    return has_text and has_num

def is_short_text_row(row):
    """Detect short text rows (1-2 short text cells), e.g. section labels."""
    non_empty = [str(c).strip() for c in row if c not in (None, '', ' ')]
    if not non_empty:
        return False
    text_only = all(not is_number(c) for c in non_empty)
    joined = " ".join(non_empty)
    return text_only and len(non_empty) <= 2 and len(joined) < 20
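# Illustrative behaviour of the row heuristics (not part of the commit):
#
#   row_ratio(["Coblong", "7.4", "120"])           -> 2/3   (2 of the 3 cells are numeric)
#   has_mixed_text_and_numbers(["Coblong", "7.4"]) -> True
#   is_short_text_row(["Total"])                   -> True  (likely a section label)
#   is_short_text_row(["Coblong", "7.4"])          -> False (contains a number)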

utils/logger_config.py Normal file
View File

@ -0,0 +1,32 @@
import logging
import os

LOG_DIR = "logs"
os.makedirs(LOG_DIR, exist_ok=True)

def setup_logger(name: str):
    """
    Standard logger configuration for all services.
    Log format:
        [LEVEL] [module name] message
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)

    # Handler that writes to a file
    file_handler = logging.FileHandler(os.path.join(LOG_DIR, "app.log"))
    file_handler.setLevel(logging.INFO)

    # Handler for the console (stdout)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)

    formatter = logging.Formatter('[%(levelname)s] [%(name)s] %(message)s')
    file_handler.setFormatter(formatter)
    console_handler.setFormatter(formatter)

    # Avoid duplicate handlers when the logger is requested more than once
    if not logger.handlers:
        logger.addHandler(file_handler)
        logger.addHandler(console_handler)
    return logger
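# Usage sketch:
#
#   logger = setup_logger("upload_service")
#   logger.info("Dataset uploaded")
#   # console and logs/app.log both receive: [INFO] [upload_service] Dataset uploaded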

utils/qgis_init.py Normal file
View File

@ -0,0 +1,30 @@
# utils/qgis_init.py
import os
import sys

# QGIS install location on Linux (Ubuntu / Debian)
QGIS_PREFIX = "/usr"

# Path to the QGIS Python modules
sys.path.append("/usr/share/qgis/python")

# Environment variables so QGIS can run headless (without a GUI)
os.environ["QGIS_PREFIX_PATH"] = QGIS_PREFIX
os.environ["QT_QPA_PLATFORM"] = "offscreen"

from qgis.core import QgsApplication
from qgis.analysis import QgsNativeAlgorithms
import processing
from processing.core.Processing import Processing

def init_qgis():
    qgs = QgsApplication([], False)
    qgs.initQgis()
    # Register the QGIS processing providers
    Processing.initialize()
    QgsApplication.processingRegistry().addProvider(QgsNativeAlgorithms())
    print("QGIS initialized successfully")
    return qgs
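# Usage sketch (a minimal headless run; the input path is an assumption, adjust to
# the installed QGIS version and data):
#
#   qgs = init_qgis()
#   try:
#       result = processing.run(
#           "native:buffer",
#           {"INPUT": "/data/points.shp", "DISTANCE": 0.01, "OUTPUT": "memory:"},
#       )
#       print(result["OUTPUT"])
#   finally:
#       qgs.exitQgis()  # release QGIS resources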