diff --git a/api/deps/__pycache__/auth_dependency.cpython-39.pyc b/api/deps/__pycache__/auth_dependency.cpython-39.pyc
new file mode 100644
index 0000000..43986f3
Binary files /dev/null and b/api/deps/__pycache__/auth_dependency.cpython-39.pyc differ
diff --git a/api/deps/__pycache__/role_dependency.cpython-39.pyc b/api/deps/__pycache__/role_dependency.cpython-39.pyc
new file mode 100644
index 0000000..e10ee0e
Binary files /dev/null and b/api/deps/__pycache__/role_dependency.cpython-39.pyc differ
diff --git a/api/deps/auth_dependency.py b/api/deps/auth_dependency.py
new file mode 100644
index 0000000..fbb1d0d
--- /dev/null
+++ b/api/deps/auth_dependency.py
@@ -0,0 +1,34 @@
+from fastapi import Depends, Header
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
+from datetime import datetime
+from response import errorRes
+
+from database.connection import SessionLocal
+from database.models import User
+
+async def get_db():
+    async with SessionLocal() as session:
+        yield session
+
+
+async def get_current_user(
+    authorization: str = Header(None),
+    db: AsyncSession = Depends(get_db)
+):
+    if not authorization or not authorization.startswith("Bearer "):
+        raise errorRes(status_code=401, message="Missing or invalid token")
+
+    token = authorization.split(" ")[1]
+    result = await db.execute(select(User).where(User.active_token == token))
+    user = result.scalar_one_or_none()
+
+    # Case 1: Token not found → maybe replaced by new login
+    if not user:
+        raise errorRes(status_code=401, message="Token invalid or used by another login")
+
+    # Case 2: Token expired
+    if user.token_expired_at and user.token_expired_at < datetime.utcnow():
+        raise errorRes(status_code=401, message="Token expired")
+
+    return user
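For orientation, a protected endpoint would consume this dependency roughly as follows — a sketch under the standard FastAPI `Depends` pattern; the `/me` route and its payload are illustrative, not part of this diff:

from fastapi import APIRouter, Depends
from api.deps.auth_dependency import get_current_user

router = APIRouter()

@router.get("/me")
async def read_profile(user = Depends(get_current_user)):
    # `user` is the ORM User row matched by the Bearer token.
    return {"username": user.username, "role": user.role}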
diff --git a/api/deps/role_dependency.py b/api/deps/role_dependency.py
new file mode 100644
index 0000000..c45069a
--- /dev/null
+++ b/api/deps/role_dependency.py
@@ -0,0 +1,20 @@
+from fastapi import Depends, status
+from api.deps.auth_dependency import get_current_user
+from response import errorRes
+
+def require_role(required_role: str):
+    """
+    Return a dependency function that ensures the current user has a specific role.
+    Example usage:
+        @router.get("/admin", dependencies=[Depends(require_role("admin"))])
+    """
+    async def role_checker(user = Depends(get_current_user)):
+        if user.role != required_role:
+            raise errorRes(
+                status_code=status.HTTP_403_FORBIDDEN,
+                message="Access denied",
+                details=f"Access denied: requires role '{required_role}'",
+            )
+        return user
+
+    return role_checker
diff --git a/api/routers/__pycache__/auth_router.cpython-39.pyc b/api/routers/__pycache__/auth_router.cpython-39.pyc
new file mode 100644
index 0000000..8a08548
Binary files /dev/null and b/api/routers/__pycache__/auth_router.cpython-39.pyc differ
diff --git a/api/routers/__pycache__/system_router.cpython-39.pyc b/api/routers/__pycache__/system_router.cpython-39.pyc
new file mode 100644
index 0000000..4ab7e1b
Binary files /dev/null and b/api/routers/__pycache__/system_router.cpython-39.pyc differ
diff --git a/api/routers/__pycache__/upload_file_router.cpython-39.pyc b/api/routers/__pycache__/upload_file_router.cpython-39.pyc
new file mode 100644
index 0000000..c212b54
Binary files /dev/null and b/api/routers/__pycache__/upload_file_router.cpython-39.pyc differ
diff --git a/api/routers/auth_router.py b/api/routers/auth_router.py
new file mode 100644
index 0000000..8e8140e
--- /dev/null
+++ b/api/routers/auth_router.py
@@ -0,0 +1,14 @@
+from fastapi import APIRouter, Depends
+from pydantic import BaseModel
+from sqlalchemy.ext.asyncio import AsyncSession
+from services.auth.login import loginService, get_db
+
+router = APIRouter()
+
+class LoginRequest(BaseModel):
+    username: str
+    password: str
+
+@router.post("/login")
+async def login(request: LoginRequest, db: AsyncSession = Depends(get_db)):
+    return await loginService(request.username, request.password, db)
diff --git a/api/routers/datasets_router.py b/api/routers/datasets_router.py
new file mode 100644
index 0000000..a88282f
--- /dev/null
+++ b/api/routers/datasets_router.py
@@ -0,0 +1,20 @@
+from fastapi import APIRouter
+from database.connection import engine
+from services.datasets.delete import delete_dataset_from_partition  # function imported from the service module
+from response import successRes, errorRes
+
+router = APIRouter()
+
+@router.delete("/dataset/{user_id}/{metadata_id}")
+async def delete_dataset(user_id: int, metadata_id: int, title: str):
+    """
+    Delete a specific dataset (identified by user_id and metadata_id).
+    """
+    try:
+        async with engine.begin() as conn:
+            await delete_dataset_from_partition(conn, user_id, metadata_id, title)
+        return successRes(message=f"Dataset {title} deleted successfully.", data="")
+
+    except Exception as e:
+        print(f"[ERROR] Failed to delete dataset: {e}")
+        raise errorRes(status_code=500, details=str(e), message="Failed to delete dataset")
diff --git a/routes/router.py b/api/routers/system_router.py
similarity index 100%
rename from routes/router.py
rename to api/routers/system_router.py
diff --git a/routes/upload_file_router.py b/api/routers/upload_file_router.py
similarity index 63%
rename from routes/upload_file_router.py
rename to api/routers/upload_file_router.py
index 9daadb6..dbcf773 100644
--- a/routes/upload_file_router.py
+++ b/api/routers/upload_file_router.py
@@ -1,13 +1,15 @@
-from fastapi import APIRouter, File, Form, UploadFile
+from fastapi import APIRouter, File, Form, UploadFile, Depends
 from pydantic import BaseModel
 from typing import List, Optional
 from services.upload_file.upload import handle_upload_file, handle_process_pdf, handle_to_postgis
+from api.deps.role_dependency import require_role
+from database.connection import engine
 
 router = APIRouter()
-
 @router.post("/file")
+# async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form(""), sheet: Optional[str] = Form(""), user = Depends(require_role("admin"))):
 async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form(""), sheet: Optional[str] = Form("")):
     return await handle_upload_file(file, page, sheet)
 
@@ -30,5 +32,5 @@ class UploadRequest(BaseModel):
     columns: List[str]
 
 @router.post("/to-postgis")
-def upload_to_postgis(payload: UploadRequest):
-    return handle_to_postgis(payload)
\ No newline at end of file
+async def upload_to_postgis(payload: UploadRequest):
+    return await handle_to_postgis(payload, engine)
\ No newline at end of file
diff --git a/database/connection.py b/database/connection.py
index 6903193..921a694 100644
--- a/database/connection.py
+++ b/database/connection.py
@@ -1,6 +1,9 @@
 from sqlalchemy import create_engine
+from sqlalchemy.ext.asyncio import create_async_engine
 from sqlalchemy.orm import sessionmaker
+from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
 from core.config import POSTGIS_URL
 
-engine = create_engine(POSTGIS_URL, pool_pre_ping=True)
-SessionLocal = sessionmaker(bind=engine)
+engine = create_async_engine(POSTGIS_URL, pool_pre_ping=True)
+# SessionLocal = sessionmaker(bind=engine)
+SessionLocal = async_sessionmaker(engine, expire_on_commit=False)
diff --git a/database/models.py b/database/models.py
index f848160..fa7c787 100644
--- a/database/models.py
+++ b/database/models.py
@@ -1,4 +1,5 @@
-from sqlalchemy import Column, Integer, String, Text, TIMESTAMP
+from sqlalchemy import Column, Integer, String, Text, ForeignKey, DateTime, TIMESTAMP
+from sqlalchemy.orm import relationship
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.sql import func
 
@@ -14,3 +15,28 @@ class UploadLog(Base):
     uploaded_at = Column(TIMESTAMP, server_default=func.now())
     status = Column(String)
     message = Column(Text)
+
+
+class Institution(Base):
+    __tablename__ = "institutions"
+
+    id = Column(Integer, primary_key=True, index=True)
+    name = Column(String(100), unique=True, nullable=False)
+    address = Column(String(200), nullable=True)
+
+    users = relationship("User", back_populates="institution")
+
+
+class User(Base):
+    __tablename__ = "users"
+
+    id = Column(Integer, primary_key=True, index=True)
+    username = Column(String(50), unique=True, nullable=False)
+    password_hash = Column(String(255), nullable=False)
+    role = Column(String(50), nullable=False, default="user")  # <── Added role
+    institution_id = Column(Integer, ForeignKey("institutions.id"), nullable=True)
+    active_token = Column(String(255), nullable=True)
+    token_expired_at = Column(DateTime, nullable=True)
+    last_login = Column(DateTime, nullable=True)
+
+    institution = relationship("Institution", back_populates="users")
\ No newline at end of file
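Since the engine is now async, the synchronous `Base.metadata.create_all(bind=engine)` call (deleted with init_db.py and commented out in main.py below) can no longer run directly. A possible replacement, sketched under the assumption that schema creation still happens at startup, routes the DDL through `run_sync()`:

import asyncio
from database.connection import engine
from database.models import Base

async def init_models():
    # DDL must be bridged onto the async connection with run_sync().
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)

if __name__ == "__main__":
    asyncio.run(init_models())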
diff --git a/database/uploader.py b/database/uploader.py
deleted file mode 100644
index da164ac..0000000
--- a/database/uploader.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import geopandas as gpd
-import pandas as pd
-from database.connection import engine
-from sqlalchemy import text
-
-def save_dataframe_dynamic(df: pd.DataFrame, table_name: str):
-    """Save pandas DataFrame to Postgres (non-geo)."""
-    df.to_sql(table_name, engine, if_exists="replace", index=False, method='multi', chunksize=1000)
-
-def save_geodataframe(gdf: gpd.GeoDataFrame, table_name: str):
-    """Save GeoDataFrame to PostGIS (requires geoalchemy/geopandas)."""
-    # ensure geometry column exists and CRS set
-    if gdf.crs is None:
-        gdf = gdf.set_crs("EPSG:4326", allow_override=True)
-    # geopandas >= 0.10 has to_postgis in some installs; fallback using SQLAlchemy + GeoAlchemy2:
-    gdf.to_postgis(table_name, engine, if_exists="replace")
diff --git a/init_db.py b/init_db.py
deleted file mode 100644
index 0702c1c..0000000
--- a/init_db.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from database.connection import engine
-from database.models import Base
-Base.metadata.create_all(bind=engine)
diff --git a/main.py b/main.py
index 799ee8b..50a55e9 100644
--- a/main.py
+++ b/main.py
@@ -3,9 +3,11 @@ from fastapi.middleware.cors import CORSMiddleware
 from core.config import API_VERSION, ALLOWED_ORIGINS
 from database.connection import engine
 from database.models import Base
-from routes.router import router as system_router
-from routes.upload_file_router import router as upload_router
-from routes.auth_router import router as auth_router
+from api.routers.system_router import router as system_router
+from api.routers.upload_file_router import router as upload_router
+from api.routers.auth_router import router as auth_router
+from contextlib import asynccontextmanager
+from utils.qgis_init import init_qgis
+
+# QGIS setup: initialize on startup, clean up on shutdown
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    global qgs
+    qgs = init_qgis()
+    print("QGIS initialized")
+
+    yield
+
+    # SHUTDOWN (optional)
+    print("Shutting down...")
 
 app = FastAPI(
     title="ETL Geo Upload Service",
+    lifespan=lifespan,  # pass lifespan here; re-creating FastAPI() later would discard the CORS middleware
@@ -21,9 +23,37 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-Base.metadata.create_all(bind=engine)
+# Base.metadata.create_all(bind=engine)  (sync create_all cannot run on an async engine)
+
+@app.get("/qgis/status")
+def qgis_status():
+    try:
+        from qgis.core import Qgis
+        return {
+            "qgis_status": "connected",
+            "qgis_version": Qgis.QGIS_VERSION
+        }
+    except Exception as e:
+        return {
+            "qgis_status": "error",
+            "error": str(e)
+        }
 
 # Register routers
 app.include_router(system_router, tags=["System"])
 app.include_router(auth_router, prefix="/auth", tags=["Auth"])
-app.include_router(upload_router, prefix="/upload", tags=["Upload"])
+app.include_router(upload_router, prefix="/upload", tags=["Upload"])
\ No newline at end of file
diff --git a/response.py b/response.py
new file mode 100644
index 0000000..3804dc3
--- /dev/null
+++ b/response.py
@@ -0,0 +1,22 @@
+from fastapi import HTTPException
+from fastapi.responses import JSONResponse
+
+def successRes(data=None, message="Success", status_code=200):
+    return JSONResponse(
+        status_code=status_code,
+        content={
+            "status": "success",
+            "message": message,
+            "data": data,
+        }
+    )
+
+def errorRes(message="Error", status_code=400, details=None):
+    return HTTPException(
+        status_code=status_code,
+        detail={
+            "status": "error",
+            "message": message,
+            "details": details
+        }
+    )
diff --git a/routes/auth_router.py b/routes/auth_router.py
deleted file mode 100644
index 7896e26..0000000
--- a/routes/auth_router.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from fastapi import APIRouter
-from core.config import API_VERSION
-
-router = APIRouter()
-
-@router.get("/login")
-async def login():
-    return {"status": "success"}
-
-
-
-
-
-
-
diff --git a/services/.DS_Store b/services/.DS_Store
new file mode 100644
index 0000000..9a03051
Binary files /dev/null and b/services/.DS_Store differ
diff --git a/services/auth/auth.py b/services/auth/auth.py
deleted file mode 100644
index e69de29..0000000
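The intended calling convention for these helpers, shown on an illustrative handler (not part of the diff): successRes builds a response to return, while errorRes builds an HTTPException that the caller must raise.

from response import successRes, errorRes

async def example_handler(found: bool):
    if not found:
        raise errorRes(status_code=404, message="Not found")  # note: raise, not return
    return successRes(message="OK", data={"items": []})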
diff --git a/services/auth/login.py b/services/auth/login.py
new file mode 100644
index 0000000..f0af30c
--- /dev/null
+++ b/services/auth/login.py
@@ -0,0 +1,49 @@
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
+from passlib.context import CryptContext
+from uuid import uuid4
+from datetime import datetime, timedelta
+from database.connection import SessionLocal
+from database.models import User
+from response import successRes, errorRes
+
+async def get_db():
+    async with SessionLocal() as session:
+        yield session
+
+pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
+
+async def loginService(username: str, password: str, db: AsyncSession):
+    result = await db.execute(select(User).where(User.username == username))
+    user = result.scalar_one_or_none()
+
+    if not user:
+        raise errorRes(status_code=401, message="Invalid username or password")
+
+    # Verify password
+    if not pwd_context.verify(password, user.password_hash):
+        raise errorRes(status_code=401, message="Invalid username or password")
+
+    # Validation for institution user
+    if user.role != "admin" and not user.institution_id:
+        raise errorRes(status_code=403, message="User must belong to an institution")
+
+    # Generate single active token
+    token = str(uuid4())
+    expiry = datetime.utcnow() + timedelta(hours=4)
+
+    user.active_token = token
+    user.token_expired_at = expiry
+    user.last_login = datetime.utcnow()
+    await db.commit()
+
+    res = {
+        "status": "success",
+        "username": user.username,
+        "role": user.role,
+        "institution_id": user.institution_id,
+        "token": token,
+        "token_expired_at": expiry.isoformat()
+    }
+
+    return successRes(message="Success Login", data=res)
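loginService only verifies hashes; nothing in this diff creates them. A hypothetical seed snippet, assuming users are provisioned elsewhere with the same bcrypt CryptContext:

from passlib.context import CryptContext
from database.models import User

pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")

def build_user(username: str, plain_password: str, role: str = "user") -> User:
    # Hash with the same context that loginService uses to verify.
    return User(
        username=username,
        password_hash=pwd_context.hash(plain_password),
        role=role,
    )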
diff --git a/services/datasets/delete.py b/services/datasets/delete.py
new file mode 100644
index 0000000..4763090
--- /dev/null
+++ b/services/datasets/delete.py
@@ -0,0 +1,33 @@
+import re
+from sqlalchemy import text
+
+def slugify(value: str):
+    return re.sub(r'[^a-zA-Z0-9]+', '_', value.lower()).strip('_')
+
+
+async def delete_dataset_from_partition(conn, user_id: int, metadata_id: int, title: str):
+    """
+    Delete a specific dataset from a specific user's partition:
+    - delete every record in the partition (test_partition_user_{id})
+    - delete the metadata row from dataset_metadata
+    - drop the related QGIS VIEW
+    """
+    base_table = f"test_partition_user_{user_id}"
+    norm_title = slugify(title)
+    view_name = f"v_user_{user_id}_{norm_title}"
+
+    print(f"[INFO] Deleting dataset metadata_id={metadata_id} owned by user_id={user_id}...")
+
+    # 1️⃣ Delete the spatial data from the partition
+    await conn.execute(text(f"DELETE FROM {base_table} WHERE metadata_id = :mid;"), {"mid": metadata_id})
+    print(f"[INFO] Spatial data in partition {base_table} (metadata_id={metadata_id}) deleted.")
+
+    # 2️⃣ Delete the metadata row from dataset_metadata
+    await conn.execute(text("DELETE FROM dataset_metadata WHERE id = :mid;"), {"mid": metadata_id})
+    print(f"[INFO] Dataset metadata id={metadata_id} deleted from the dataset_metadata table.")
+
+    # 3️⃣ Drop the related QGIS view
+    await conn.execute(text(f"DROP VIEW IF EXISTS {view_name} CASCADE;"))
+    print(f"[INFO] View {view_name} dropped (if it existed).")
+
+    print("[INFO] ✅ Dataset deletion complete.")
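Note that `base_table` and `view_name` are interpolated into SQL as identifiers, which bind parameters cannot cover. A defensive sketch — an addition, not part of the diff — that whitelists identifiers before they reach the DELETE/DROP statements:

import re

IDENTIFIER_RE = re.compile(r"^[a-z0-9_]{1,63}$")  # PostgreSQL identifier length limit

def safe_identifier(name: str) -> str:
    # slugify() output is lowercase alphanumerics and underscores, so anything
    # else indicates tampering or a bug upstream.
    if not IDENTIFIER_RE.match(name):
        raise ValueError(f"Unsafe SQL identifier: {name!r}")
    return name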
diff --git a/services/upload_file/.DS_Store b/services/upload_file/.DS_Store
new file mode 100644
index 0000000..5008ddf
Binary files /dev/null and b/services/upload_file/.DS_Store differ
diff --git a/services/upload_file/read_csv/reader_csv.py b/services/upload_file/read_csv/reader_csv.py
deleted file mode 100644
index 1c66c78..0000000
--- a/services/upload_file/read_csv/reader_csv.py
+++ /dev/null
@@ -1,283 +0,0 @@
-# import pandas as pd
-# import re
-# import csv
-# import os
-
-# def detect_header_line(path, max_rows=10):
-#     with open(path, 'r', encoding='utf-8', errors='ignore') as f:
-#         lines = [next(f) for _ in range(max_rows)]
-
-#     header_line_idx = 0
-#     best_score = -1
-
-#     for i, line in enumerate(lines):
-#         cells = re.split(r'[;,|\t]', line.strip())
-#         alpha_ratio = sum(bool(re.search(r'[A-Za-z]', c)) for c in cells) / max(len(cells), 1)
-#         digit_ratio = sum(bool(re.search(r'\d', c)) for c in cells) / max(len(cells), 1)
-#         score = alpha_ratio - digit_ratio
-
-#         if score > best_score:
-#             best_score = score
-#             header_line_idx = i
-
-#     return header_line_idx
-
-
-# def detect_delimiter(path, sample_size=2048):
-#     with open(path, 'r', encoding='utf-8', errors='ignore') as f:
-#         sample = f.read(sample_size)
-#     sniffer = csv.Sniffer()
-#     try:
-#         dialect = sniffer.sniff(sample)
-#         return dialect.delimiter
-#     except Exception:
-#         for delim in [',', ';', '\t', '|']:
-#             if delim in sample:
-#                 return delim
-#         return ','
-
-
-# def read_csv(path: str):
-#     ext = os.path.splitext(path)[1].lower()  # ambil ekstensi file
-
-#     try:
-#         if ext in ['.csv', '.txt']:
-#             # === Baca file CSV ===
-#             header_line = detect_header_line(path)
-#             delimiter = detect_delimiter(path)
-#             print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
-
-#             df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',')
-
-#         elif ext in ['.xlsx', '.xls']:
-#             # === Baca file Excel ===
-#             print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
-#             pre_df = pd.read_excel(path, header=0, dtype=str)  # baca semua sebagai string
-#             df = pre_df.copy()
-#             for col in df.columns:
-#                 if df[col].str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
-#                     df[col] = df[col].str.replace(',', '', regex=False)
-#                     df[col] = pd.to_numeric(df[col], errors='ignore')
-
-#         else:
-#             raise ValueError("Format file tidak dikenali (hanya .csv, .txt, .xlsx, .xls)")
-
-#     except Exception as e:
-#         print(f"[WARN] Gagal membaca file ({e}), fallback ke default")
-#         df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
-
-#     # Bersihkan kolom dan baris kosong
-#     df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
-#     df.columns = [str(c).strip() for c in df.columns]
-#     df = df.dropna(how='all')
-
-#     return df
-
-
-
-
-
-
-
-
-
-
-
-
-
-import pandas as pd
-import re
-import csv
-import os
-
-def detect_header_line(path, max_rows=10):
-    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
-        lines = [next(f) for _ in range(max_rows)]
-    header_line_idx = 0
-    best_score = -1
-    for i, line in enumerate(lines):
-        cells = re.split(r'[;,|\t]', line.strip())
-        alpha_ratio = sum(bool(re.search(r'[A-Za-z]', c)) for c in cells) / max(len(cells), 1)
-        digit_ratio = sum(bool(re.search(r'\d', c)) for c in cells) / max(len(cells), 1)
-        score = alpha_ratio - digit_ratio
-        if score > best_score:
-            best_score = score
-            header_line_idx = i
-    return header_line_idx
-
-def detect_delimiter(path, sample_size=2048):
-    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
-        sample = f.read(sample_size)
-    sniffer = csv.Sniffer()
-    try:
-        dialect = sniffer.sniff(sample)
-        return dialect.delimiter
-    except Exception:
-        for delim in [',', ';', '\t', '|']:
-            if delim in sample:
-                return delim
-        return ','
-
-# def read_csv(path: str):
-#     ext = os.path.splitext(path)[1].lower()
-
-#     try:
-#         if ext in ['.csv']:
-#             # === Baca file CSV ===
-#             header_line = detect_header_line(path)
-#             delimiter = detect_delimiter(path)
-#             print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
-
-#             df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',')
-
-#         elif ext in ['.xlsx', '.xls']:
-#             # === Baca file Excel ===
-#             print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
-#             xls = pd.ExcelFile(path)
-
-#             print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")
-
-#             # Evaluasi tiap sheet untuk mencari yang paling relevan
-#             best_sheet = None
-#             best_score = -1
-#             best_df = None
-
-#             for sheet_name in xls.sheet_names:
-#                 try:
-#                     df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
-#                     df = df.dropna(how='all').dropna(axis=1, how='all')
-
-#                     if len(df) == 0 or len(df.columns) < 2:
-#                         continue
-
-#                     # hitung "skor relevansi"
-#                     text_ratio = df.applymap(lambda x: isinstance(x, str)).sum().sum() / (df.size or 1)
-#                     row_score = len(df)
-#                     score = (row_score * 0.7) + (text_ratio * 100)
-
-#                     if score > best_score:
-#                         best_score = score
-#                         best_sheet = sheet_name
-#                         best_df = df
-
-#                 except Exception as e:
-#                     print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
-#                     continue
-
-#             if best_df is not None:
-#                 print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
-#                 df = best_df
-#             else:
-#                 raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
-
-#             # Konversi tipe numerik jika ada
-#             for col in df.columns:
-#                 if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
-#                     df[col] = df[col].astype(str).str.replace(',', '', regex=False)
-#                     df[col] = pd.to_numeric(df[col], errors='ignore')
-
-#         else:
-#             raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)")
-
-#     except Exception as e:
-#         print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
-#         df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
-
-#     # Bersihkan kolom dan baris kosong
-#     df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
-#     df.columns = [str(c).strip() for c in df.columns]
-#     df = df.dropna(how='all')
-
-#     return df
-
-
-
-
-
-def read_csv(path: str, sheet: str = None):
-    ext = os.path.splitext(path)[1].lower()
-
-    try:
-        if ext in ['.csv']:
-            # === Baca file CSV ===
-            header_line = detect_header_line(path)
-            delimiter = detect_delimiter(path)
-            print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
-
-            df = pd.read_csv(
-                path,
-                header=header_line,
-                sep=delimiter,
-                encoding='utf-8',
-                low_memory=False,
-                thousands=','
-            )
-
-        elif ext in ['.xlsx', '.xls']:
-            # === Baca file Excel ===
-            print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
-            xls = pd.ExcelFile(path)
-            print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")
-
-            # === Jika user memberikan nama sheet ===
-            if sheet:
-                if sheet not in xls.sheet_names:
-                    raise ValueError(f"Sheet '{sheet}' tidak ditemukan dalam file {os.path.basename(path)}")
-                print(f"[INFO] Membaca sheet yang ditentukan: '{sheet}'")
-                df = pd.read_excel(xls, sheet_name=sheet, header=0, dtype=str)
-                df = df.dropna(how='all').dropna(axis=1, how='all')
-
-            else:
-                # === Auto-detect sheet terbaik ===
-                print("[INFO] Tidak ada sheet yang ditentukan, mencari sheet paling relevan...")
-                best_sheet = None
-                best_score = -1
-                best_df = None
-
-                for sheet_name in xls.sheet_names:
-                    try:
-                        temp_df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
-                        temp_df = temp_df.dropna(how='all').dropna(axis=1, how='all')
-
-                        if len(temp_df) == 0 or len(temp_df.columns) < 2:
-                            continue
-
-                        # hitung skor relevansi
-                        text_ratio = temp_df.applymap(lambda x: isinstance(x, str)).sum().sum() / (temp_df.size or 1)
-                        row_score = len(temp_df)
-                        score = (row_score * 0.7) + (text_ratio * 100)
-
-                        if score > best_score:
-                            best_score = score
-                            best_sheet = sheet_name
-                            best_df = temp_df
-
-                    except Exception as e:
-                        print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
-                        continue
-
-                if best_df is not None:
-                    print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
-                    df = best_df
-                else:
-                    raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
-
-            # Konversi tipe numerik jika ada
-            for col in df.columns:
-                if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
-                    df[col] = df[col].astype(str).str.replace(',', '', regex=False)
-                    df[col] = pd.to_numeric(df[col], errors='ignore')
-
-        else:
-            raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)")
-
-    except Exception as e:
-        print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
-        df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
-
-    # Bersihkan kolom dan baris kosong
-    df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
-    df.columns = [str(c).strip() for c in df.columns]
-    df = df.dropna(how='all')
-
-    return df
-
diff --git a/services/upload_file/read_pdf/filter_column.py b/services/upload_file/read_pdf/filter_column.py
deleted file mode 100644
index b814503..0000000
--- a/services/upload_file/read_pdf/filter_column.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import re
-import itertools
-
-geo_admin_keywords = [
-    'lat', 'lon', 'long', 'latitude', 'longitude', 'koordinat', 'geometry', 'geometri',
-    'desa', 'kelurahan', 'kel', 'kecamatan', 'kabupaten', 'kab', 'kota', 'provinsi',
-    'lokasi', 'region', 'area', 'zone', 'boundary', 'batas'
-]
-
-def normalize_text(text):
-    text = text.lower()
-    text = re.sub(r'[^a-z0-9/ ]+', ' ', text)
-    text = re.sub(r'\s+', ' ', text).strip()
-    return text
-
-def generate_combined_patterns(keywords):
-    combos = list(itertools.combinations(keywords, 2))
-    patterns = []
-    for a, b in combos:
-        patterns.append(rf'{a}\s*/\s*{b}')
-        patterns.append(rf'{b}\s*/\s*{a}')
-    return patterns
-
-combined_patterns = generate_combined_patterns(geo_admin_keywords)
-
-def contains_geo_admin_keywords(text):
-    text_clean = normalize_text(text)
-    if len(text_clean) < 3:
-        return False
-
-    for pattern in combined_patterns:
-        if re.search(pattern, text_clean):
-            return True
-
-    for kw in geo_admin_keywords:
-        if re.search(rf'(^|[\s/_-]){kw}([\s/_-]|$)', text_clean):
-            return True
-
-    return False
-
-def filter_geo_admin_column(tables):
-    filtered = []
-    for table in tables:
-        found = any(contains_geo_admin_keywords(col) for col in table['columns'])
-        if found:
-            filtered.append(table)
-    return filtered
diff --git a/services/upload_file/read_pdf/reader_pdf.py b/services/upload_file/read_pdf/reader_pdf.py
deleted file mode 100644
index 0f31264..0000000
--- a/services/upload_file/read_pdf/reader_pdf.py
+++ /dev/null
@@ -1,270 +0,0 @@
-import pdfplumber
-import re
-import pandas as pd
-from services.upload_file.read_pdf.filter_column import filter_geo_admin_column
-
-def is_number(s):
-    if s is None:
-        return False
-    s = str(s).strip().replace(',', '').replace('.', '')
-    return s.isdigit()
-
-def row_ratio(row):
-    non_empty = [c for c in row if c not in (None, '', ' ')]
-    if not non_empty:
-        return 0
-    num_count = sum(is_number(c) for c in non_empty)
-    return num_count / len(non_empty)
-
-def has_mixed_text_and_numbers(row):
-    non_empty = [c for c in row if c not in (None, '', ' ')]
-    has_text = any(isinstance(c, str) and re.search(r'[A-Za-z]', str(c)) for c in non_empty)
-    has_num = any(is_number(c) for c in non_empty)
-    return has_text and has_num
-
-def is_short_text_row(row):
-    """Deteksi baris teks pendek (1-2 kolom teks pendek)."""
-    non_empty = [str(c).strip() for c in row if c not in (None, '', ' ')]
-    if not non_empty:
-        return False
-    text_only = all(not is_number(c) for c in non_empty)
-    joined = " ".join(non_empty)
-    return text_only and len(non_empty) <= 2 and len(joined) < 20
-
-def detect_header_rows(rows):
-    if not rows:
-        return []
-
-    ratios = [row_ratio(r) for r in rows]
-    body_start_index = None
-
-    for i in range(1, len(rows)):
-        row = rows[i]
-        if has_mixed_text_and_numbers(row):
-            body_start_index = i
-            break
-        if ratios[i] > 0.3:
-            body_start_index = i
-            break
-        if any(isinstance(c, str) and re.match(r'^\d+$', c.strip()) for c in row):
-            body_start_index = i
-            break
-        if ratios[i - 1] == 0 and ratios[i] > 0:
-            body_start_index = i
-            break
-
-    if body_start_index is None:
-        body_start_index = len(rows)
-
-    potential_headers = rows[:body_start_index]
-    body_filtered = rows[body_start_index:]
-    header_filtered = []
-    for idx, row in enumerate(potential_headers):
-        if is_short_text_row(row):
-            if idx + 1 < len(potential_headers) and ratios[idx + 1] == 0:
-                header_filtered.append(row)
-            else:
-                continue
-        else:
-            header_filtered.append(row)
-
-    return header_filtered, body_filtered
-
-
-def merge_multiline_header(header_rows):
-    final_header = []
-    for col in zip(*header_rows):
-        val = next((v for v in reversed(col) if v and str(v).strip()), '')
-        val = str(val).replace('\n', ' ').strip()
-        final_header.append(val)
-    final_header = [v for v in final_header if v not in ['', None]]
-    return final_header
-
-
-
-NUMBER_HEADER_KEYWORDS = ["no","no.","no .","no . ","no :","no : ","nomor","nomor.","nomor :","nomor urut","no urut","no. urut","no-urut","no_urut","nomor_urut","nomor-urut","No","NO","NO.","No.","No :","NO :","Nomor","NOMOR","Nomor Urut","NOMOR URUT","No Urut","NO URUT","No. Urut","NO. URUT","No /","No / ","No / Nama","No -","No - ","Nomor /","Nomor -","Number","No. of","No of","Index","Serial","Order","ID","ID No","ID No.","Sr No","Sr. No","S/N","SN","Sl No","Sl. No","N0","N0.","N0 :","NOM0R","NOM0R URUT","N0MOR",]
-
-def has_number_header(header):
-    """Periksa apakah header mengandung kolom No/Nomor."""
-    header_text = header
-    return any(keyword in header_text for keyword in NUMBER_HEADER_KEYWORDS)
-
-def is_numbering_column(col_values):
-    """Periksa apakah kolom pertama diisi nomor urut seperti 1, 01, 2, dst."""
-    numeric_like = 0
-    total = 0
-    for v in col_values:
-        if not v or not isinstance(v, str):
-            continue
-        total += 1
-        if re.fullmatch(r"0*\d{1,3}", v.strip()):
-            numeric_like += 1
-    return total > 0 and (numeric_like / total) > 0.6
-
-def is_numeric_value(v):
-    """Cek apakah suatu nilai termasuk angka (int, float, atau string angka)."""
-    if v is None:
-        return False
-    if isinstance(v, (int, float)):
-        return True
-    if isinstance(v, str) and re.fullmatch(r"0*\d{1,3}", v.strip()):
-        return True
-    return False
-
-def cleaning_column(headers, bodies):
-    cleaned_bodies = []
-
-    for header, body in zip(headers, bodies):
-        if not body:
-            cleaned_bodies.append(body)
-            continue
-
-        header_has_number = has_number_header(header)
-        first_col = [row[0] for row in body if row and len(row) > 0]
-        first_col_is_numbering = is_numbering_column(first_col)
-
-        if not header_has_number and first_col_is_numbering:
-            new_body = []
-            for row in body:
-                if not row:
-                    continue
-                first_val = row[0]
-                if is_numeric_value(first_val) and len(row) > 1:
-                    new_body.append(row[1:])
-                else:
-                    new_body.append(row)
-            body = new_body
-
-        header_len = len(headers)
-        filtered_body = [row for row in body if len(row) == header_len]
-
-        cleaned_bodies.append(filtered_body)
-
-    return cleaned_bodies
-
-
-
-
-def parse_page_selection(selectedPage: str, total_pages: int):
-    if not selectedPage:
-        return list(range(1, total_pages + 1))
-
-    pages = set()
-    parts = re.split(r'[,\s]+', selectedPage.strip())
-
-    for part in parts:
-        if '-' in part:
-            try:
-                start, end = map(int, part.split('-'))
-                pages.update(range(start, end + 1))
-            except ValueError:
-                continue
-        else:
-            try:
-                pages.add(int(part))
-            except ValueError:
-                continue
-
-    valid_pages = [p for p in sorted(pages) if 1 <= p <= total_pages]
-    return valid_pages
-
-
-
-def read_pdf(path: str, page: str):
-    pdf_path = path
-    selectedPage = None
-    # if page == '' or None:
-    #     selectedPage = "1"
-    if not page:
-        selectedPage = "1"
-    else:
-        selectedPage = page
-    tables_data = []
-    with pdfplumber.open(pdf_path) as pdf:
-        total_pages = len(pdf.pages)
-        selected_pages = parse_page_selection(selectedPage, total_pages)
-
-        print(f"[INFO] Total halaman PDF: {total_pages}")
-        print(f"[INFO] Halaman yang dipilih untuk dibaca: {selected_pages}")
-
-        for page_num in selected_pages:
-            pdf_page = pdf.pages[page_num - 1]  # index pdfplumber mulai dari 0
-            tables = pdf_page.find_tables()
-            print(f"[INFO] Halaman {page_num}: {len(tables)} tabel terdeteksi")
-
-            for t in tables:
-                table = t.extract()
-                if len(table) > 2:
-                    tables_data.append(table)
-
-    print(f"\nTotal tabel valid: {len(tables_data)}\n")
-
-    header_only = []
-    body_only = []
-    for tbl in tables_data:
-        head, body = detect_header_rows(tbl)
-        header_only.append(head)
-        body_only.append(body)
-
-    clean_header = []
-    for h in header_only:
-        clean_header.append(merge_multiline_header(h))
-
-    clean_body=[]
-    for i, raw_body in enumerate(body_only):
-        con_body = [[cell for cell in row if cell not in (None, '')] for row in raw_body]
-        cleaned = cleaning_column(clean_header[i], [con_body])
-        clean_body.append(cleaned[0])
-
-    parsed = []
-    for i, (cols, rows) in enumerate(zip(clean_header, clean_body), start=1):
-        parsed.append({
-            "title": str(i),
-            "columns": cols,
-            "rows": rows
-        })
-
-    clean_parsed = filter_geo_admin_column(parsed)
-    # print(f"parsed{clean_parsed}")
-    return clean_parsed
-
-
-
-
-
-
-def convert_df(payload):
-    if "columns" not in payload or "rows" not in payload:
-        raise ValueError("Payload tidak memiliki key 'columns' atau 'rows'.")
-
-    if not isinstance(payload["columns"], list):
-        raise TypeError("'columns' harus berupa list.")
-    if not isinstance(payload["rows"], list):
-        raise TypeError("'rows' harus berupa list.")
-
-    for i, row in enumerate(payload["rows"]):
-        if len(row) != len(payload["columns"]):
-            raise ValueError(f"Jumlah elemen di baris ke-{i} tidak sesuai jumlah kolom.")
-
-    df = pd.DataFrame(payload["rows"], columns=payload["columns"])
-
-    if "title" in payload:
-        df.attrs["title"] = payload["title"]
-
-    return df
-
-
-
-
-
-
-
-
-def test_read_pdf():
-    # single
-    # parsed = [{'title': 'Tabel 3.49. Potensi Penduduk Terpapar Bencana Banjir di Provinsi Jawa Timur', 'columns': ['No', 'Kabupaten/Kota', 'Jumlah Penduduk Terpapar (Jiwa)', 'Penduduk Umur Rentan', 'Penduduk Miskin', 'Penduduk Disabilitas', 'Kelas'], 'rows': [['1', 'PACITAN', '111.309', '14.142', '9.307', '781', 'SEDANG'], ['2', 'PONOROGO', '381.579', '50.815', '44.256', '2.346', 'SEDANG'], ['3', 'TRENGGALEK', '284.509', '34.304', '33.653', '1.945', 'SEDANG'], ['4', 'TULUNGAGUNG', '777.174', '86.452', '67.952', '3.200', 'SEDANG'], ['5', 'BLITAR', '226.767', '25.032', '22.554', '909', 'SEDANG'], ['6', 'KEDIRI', '545.961', '59.272', '74.578', '2.539', 'SEDANG'], ['7', 'MALANG', '238.170', '23.646', '25.388', '641', 'SEDANG'], ['8', 'LUMAJANG', '267.926', '30.206', '33.738', '970', 'SEDANG'], ['9', 'JEMBER', '1.061.703', '109.355', '105.958', '2.424', 'SEDANG'], ['10', 'BANYUWANGI', '442.290', '51.294', '44.107', '1.168', 'SEDANG'], ['11', 'BONDOWOSO', '143.452', '18.178', '21.676', '517', 'SEDANG'], ['12', 'SITUBONDO', '233.211', '26.799', '54.221', '928', 'SEDANG'], ['13', 'PROBOLINGGO', '326.005', '37.002', '58.562', '1.323', 'SEDANG'], ['14', 'PASURUAN', '485.143', '49.285', '65.076', '1.576', 'SEDANG'], ['15', 'SIDOARJO', '1.930.615', '172.191', '132.673', '3.987', 'SEDANG'], ['16', 'MOJOKERTO', '498.583', '52.453', '49.831', '1.491', 'SEDANG'], ['17', 'JOMBANG', '876.937', '92.415', '107.447', '4.985', 'SEDANG'], ['18', 'NGANJUK', '829.022', '95.454', '117.127', '3.029', 'SEDANG'], ['19', 'MADIUN', '363.763', '44.997', '44.877', '1.695', 'SEDANG'], ['20', 'MAGETAN', '117.247', '15.706', '11.051', '652', 'SEDANG'], ['21', 'NGAWI', '419.065', '49.864', '65.877', '1.572', 'SEDANG'], ['22', 'BOJONEGORO', '910.377', '100.800', '117.977', '3.557', 'SEDANG'], ['23', 'TUBAN', '507.407', '51.775', '60.834', '2.206', 'SEDANG'], ['24', 'LAMONGAN', '884.503', '99.928', '96.031', '3.960', 'SEDANG'], ['25', 'GRESIK', '613.133', '59.848', '49.854', '1.666', 'SEDANG'], ['26', 'BANGKALAN', '312.149', '31.075', '36.099', '1.169', 'SEDANG'], ['27', 'SAMPANG', '239.656', '28.756', '39.790', '1.280', 'SEDANG'], ['28', 'PAMEKASAN', '216.423', '25.831', '30.296', '776', 'SEDANG'], ['29', 'SUMENEP', '217.805', '24.741', '33.293', '1.088', 'SEDANG'], ['1', 'KOTA KEDIRI', '162.064', '17.129', '13.997', '363', 'SEDANG'], ['2', 'KOTA BLITAR', '21.390', '2.242', '1.185', '79', 'SEDANG'], ['3', 'KOTA MALANG', '148.072', '15.499', '6.142', '201', 'SEDANG'], ['4', 'KOTA PROBOLINGGO', '117.911', '12.708', '10.913', '420', 'SEDANG'], ['5', 'KOTA PASURUAN', '199.602', '20.199', '19.721', '516', 'SEDANG'], ['6', 'KOTA MOJOKERTO', '139.962', '14.486', '6.971', '584', 'SEDANG'], ['7', 'KOTA MADIUN', '149.468', '17.255', '6.300', '304', 'SEDANG'], ['8', 'KOTA SURABAYA', '2.469.639', '244.061', '133.953', '3.838', 'SEDANG'], ['9', 'KOTA BATU', '8.858', '939', '529', '13', 'SEDANG'], ['-', 'Provinsi Jawa Timur', '17.878.850', '1.906.134', '1.853.794', '60.698', 'SEDANG']]}]
-
-    # double
-    parsed = [{"title":"Luas Catchment Area (km2) Pada Wilayah Sungai di Provinsi Jawa Timur","columns":["Wilayah Sungai","Luas (km2)","Jumlah DAS"],"rows":[["Bengawan Solo","13.070,00","94 DAS"],["Brantas","13.880,00","20 DAS"],["Welang -Rejoso","2.601,00","36 DAS"],["Pekalen -Sampean","3.953,00","56 DAS"],["Baru -Bajulmati","3.675,00","60 DAS"],["Bondoyudo -Bedadung","5.364,00","47 DAS"],["Madura","4.575,00","173 DAS"]]},{"title":"Jumlah dan Kepadatan Penduduk Menurut Kabupaten/kota di Provinsi Jawa Timur Tahun 2021","columns":["Kabupaten/Kota","Jumlah Penduduk","Persentase","Kepadatan Penduduk (Jiwa per Km2)"],"rows":[["Bangkalan","1.082.759","2,64","1.081,20"],["Banyuwangi","1.749.773","4,27","302,60"],["Blitar","1.228.292","3,00","919,05"],["Bojonegoro","1.343.895","3,28","611,20"],["Bondowoso","801.541","1,96","525,27"],["Gresik","1.283.961","3,13","1.077,83"],["Jember","2.581.486","6,30","834,80"],["Jombang","1.350.483","3,29","1.211,10"],["Kediri","1.671.821","4,08","1.206,18"],["Lamongan","1.379.731","3,37","774,24"],["Lumajang","1.091.856","2,66","609,67"],["Madiun","754.263","1,84","726,94"],["Magetan","689.369","1,68","1.000,77"],["Malang","2.611.907","6,37","739,78"],["Mojokerto","1.126.540","2,75","1.569,37"],["Nganjuk","1.133.556","2,77","925,92"],["Ngawi","896.768","2,19","691,96"],["Pacitan","597.580","1,46","429,94"],["Pamekasan","840.790","2,05","1.061,28"],["Pasuruan","1.603.754","3,91","1.088,01"],["Ponorogo","968.681","2,36","741,89"],["Probolinggo","1.156.570","2,82","681,86"],["Sampang","902.514","2,20","731,92"],["Sidoarjo","1.951.723","4,76","3.076,58"],["Situbondo","666.245","1,63","398,98"],["Sumenep","1.134.750","2,77","567,79"],["Trenggalek","746.734","1,82","650,91"],["Tuban","1.223.257","2,98","666,93"],["Tulungagung","1.126.679","2,75","1.067,28"],["Kota Batu","215.248","0,53","1.574,14"],["Kota Blitar","158.123","0,39","4.854,87"],["Kota Kediri","292.363","0,71","4.611,40"],["Kota Madiun","201.243","0,49","6.045,15"],["Kota Malang","866.356","2,11","5.963,35"],["Kota Mojokerto","139.961","0,34","8.497,94"],["Kota Pasuruan","210.341","0,51","5.960,36"],["Kota Probolinggo","242.246","0,59","4.274,68"],["Kota Surabaya","2.970.843","7,25","8.475,05"],["Provinsi Jawa Timur","40.994.002","100,00","76.228,17"]]}]
-    # df = convert_df(parsed, table_index=0)
-    return parsed
\ No newline at end of file
diff --git a/services/upload_file/readers/__pycache__/reader_csv.cpython-39.pyc b/services/upload_file/readers/__pycache__/reader_csv.cpython-39.pyc
new file mode 100644
index 0000000..2891219
Binary files /dev/null and b/services/upload_file/readers/__pycache__/reader_csv.cpython-39.pyc differ
diff --git a/services/upload_file/readers/__pycache__/reader_gdb.cpython-39.pyc b/services/upload_file/readers/__pycache__/reader_gdb.cpython-39.pyc
new file mode 100644
index 0000000..935c85f
Binary files /dev/null and b/services/upload_file/readers/__pycache__/reader_gdb.cpython-39.pyc differ
diff --git a/services/upload_file/readers/__pycache__/reader_mpk.cpython-39.pyc b/services/upload_file/readers/__pycache__/reader_mpk.cpython-39.pyc
new file mode 100644
index 0000000..b9c3817
Binary files /dev/null and b/services/upload_file/readers/__pycache__/reader_mpk.cpython-39.pyc differ
diff --git a/services/upload_file/readers/__pycache__/reader_pdf.cpython-39.pyc b/services/upload_file/readers/__pycache__/reader_pdf.cpython-39.pyc
new file mode 100644
index 0000000..4c165af
Binary files /dev/null and b/services/upload_file/readers/__pycache__/reader_pdf.cpython-39.pyc differ
diff --git a/services/upload_file/readers/__pycache__/reader_shp.cpython-39.pyc b/services/upload_file/readers/__pycache__/reader_shp.cpython-39.pyc
new file mode 100644
index 0000000..0d13461
Binary files /dev/null and b/services/upload_file/readers/__pycache__/reader_shp.cpython-39.pyc differ
diff --git a/services/upload_file/readers/reader_csv.py b/services/upload_file/readers/reader_csv.py
new file mode 100644
index 0000000..b958ee9
--- /dev/null
+++ b/services/upload_file/readers/reader_csv.py
@@ -0,0 +1,116 @@
+import pandas as pd
+import re
+import csv
+import os
+
+def detect_header_line(path, max_rows=10):
+    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
+        lines = [next(f) for _ in range(max_rows)]
+    header_line_idx = 0
+    best_score = -1
+    for i, line in enumerate(lines):
+        cells = re.split(r'[;,|\t]', line.strip())
+        alpha_ratio = sum(bool(re.search(r'[A-Za-z]', c)) for c in cells) / max(len(cells), 1)
+        digit_ratio = sum(bool(re.search(r'\d', c)) for c in cells) / max(len(cells), 1)
+        score = alpha_ratio - digit_ratio
+        if score > best_score:
+            best_score = score
+            header_line_idx = i
+    return header_line_idx
+
+def detect_delimiter(path, sample_size=2048):
+    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
+        sample = f.read(sample_size)
+    sniffer = csv.Sniffer()
+    try:
+        dialect = sniffer.sniff(sample)
+        return dialect.delimiter
+    except Exception:
+        for delim in [',', ';', '\t', '|']:
+            if delim in sample:
+                return delim
+        return ','
+
+
+def read_csv(path: str, sheet: str = None):
+    ext = os.path.splitext(path)[1].lower()
+
+    try:
+        if ext in ['.csv']:
+            header_line = detect_header_line(path)
+            delimiter = detect_delimiter(path)
+            print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
+
+            df = pd.read_csv(
+                path,
+                header=header_line,
+                sep=delimiter,
+                encoding='utf-8',
+                low_memory=False,
+                thousands=','
+            )
+
+        elif ext in ['.xlsx', '.xls']:
+            print(f"[INFO] Reading Excel file: {os.path.basename(path)}")
+            xls = pd.ExcelFile(path)
+            print(f"[INFO] Found {len(xls.sheet_names)} sheets: {xls.sheet_names}")
+
+            if sheet:
+                if sheet not in xls.sheet_names:
+                    raise ValueError(f"Sheet '{sheet}' not found in file {os.path.basename(path)}")
+                print(f"[INFO] Reading the specified sheet: '{sheet}'")
+                df = pd.read_excel(xls, sheet_name=sheet, header=0, dtype=str)
+                df = df.dropna(how='all').dropna(axis=1, how='all')
+
+            else:
+                print("[INFO] No sheet specified, searching for the most relevant sheet...")
+                best_sheet = None
+                best_score = -1
+                best_df = None
+
+                for sheet_name in xls.sheet_names:
+                    try:
+                        temp_df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
+                        temp_df = temp_df.dropna(how='all').dropna(axis=1, how='all')
+
+                        if len(temp_df) == 0 or len(temp_df.columns) < 2:
+                            continue
+
+                        # compute a relevance score
+                        text_ratio = temp_df.applymap(lambda x: isinstance(x, str)).sum().sum() / (temp_df.size or 1)
+                        row_score = len(temp_df)
+                        score = (row_score * 0.7) + (text_ratio * 100)
+
+                        if score > best_score:
+                            best_score = score
+                            best_sheet = sheet_name
+                            best_df = temp_df
+
+                    except Exception as e:
+                        print(f"[WARN] Failed to read sheet {sheet_name}: {e}")
+                        continue
+
+                if best_df is not None:
+                    print(f"[INFO] Selected sheet: '{best_sheet}' with score {best_score:.2f}")
+                    df = best_df
+                else:
+                    raise ValueError("No valid sheet could be read.")
+
+            for col in df.columns:
+                if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
+                    df[col] = df[col].astype(str).str.replace(',', '', regex=False)
+                    df[col] = pd.to_numeric(df[col], errors='ignore')
+
+        else:
+            raise ValueError("Unrecognized file format (only .csv, .xlsx, .xls)")
+
+    except Exception as e:
+        print(f"[WARN] Failed to read file ({e}), falling back to the default reader.")
+        df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
+
+    df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
+    df.columns = [str(c).strip() for c in df.columns]
+    df = df.dropna(how='all')
+
+    return df
+
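For reference, the reader would typically be driven like this (the file path and sheet name are assumed values, not part of the diff):

from services.upload_file.readers.reader_csv import read_csv

# Excel: pick a specific sheet; for CSV the header line and delimiter are auto-detected.
df = read_csv("uploads/population.xlsx", sheet="Sheet1")
print(df.shape, list(df.columns)[:5])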
diff --git a/services/upload_file/read_gdb/reader_gdb.py b/services/upload_file/readers/reader_gdb.py
similarity index 100%
rename from services/upload_file/read_gdb/reader_gdb.py
rename to services/upload_file/readers/reader_gdb.py
diff --git a/services/upload_file/read_mpk/reader_mpk.py b/services/upload_file/readers/reader_mpk.py
similarity index 100%
rename from services/upload_file/read_mpk/reader_mpk.py
rename to services/upload_file/readers/reader_mpk.py
diff --git a/services/upload_file/readers/reader_pdf.py b/services/upload_file/readers/reader_pdf.py
new file mode 100644
index 0000000..972927f
--- /dev/null
+++ b/services/upload_file/readers/reader_pdf.py
@@ -0,0 +1,168 @@
+import re
+import pdfplumber
+import pandas as pd
+from services.upload_file.utils.pdf_cleaner import row_ratio, has_mixed_text_and_numbers, is_short_text_row, parse_page_selection, filter_geo_admin_column, cleaning_column
+from services.upload_file.upload_exceptions import PDFReadError
+from utils.logger_config import setup_logger
+
+logger = setup_logger(__name__)
+
+def detect_header_rows(rows):
+    if not rows:
+        return [], []
+
+    ratios = [row_ratio(r) for r in rows]
+    body_start_index = None
+
+    for i in range(1, len(rows)):
+        row = rows[i]
+        if has_mixed_text_and_numbers(row):
+            body_start_index = i
+            break
+        if ratios[i] > 0.3:
+            body_start_index = i
+            break
+        if any(isinstance(c, str) and re.match(r'^\d+$', c.strip()) for c in row):
+            body_start_index = i
+            break
+        if ratios[i - 1] == 0 and ratios[i] > 0:
+            body_start_index = i
+            break
+
+    if body_start_index is None:
+        body_start_index = len(rows)
+
+    potential_headers = rows[:body_start_index]
+    body_filtered = rows[body_start_index:]
+    header_filtered = []
+    for idx, row in enumerate(potential_headers):
+        if is_short_text_row(row):
+            if idx + 1 < len(potential_headers) and ratios[idx + 1] == 0:
+                header_filtered.append(row)
+            else:
+                continue
+        else:
+            header_filtered.append(row)
+
+    return header_filtered, body_filtered
+
+
+def merge_multiline_header(header_rows):
+    final_header = []
+    for col in zip(*header_rows):
+        val = next((v for v in reversed(col) if v and str(v).strip()), '')
+        val = str(val).replace('\n', ' ').strip()
+        final_header.append(val)
+    final_header = [v for v in final_header if v not in ['', None]]
+    return final_header
+
+
+def read_pdf(path: str, page: str):
+    """
+    Read tables from a PDF file semi-automatically using `pdfplumber`.
+
+    Main processing flow:
+    1. **Open the PDF file** with pdfplumber.
+    2. **Select pages** based on the `page` input (e.g. "1,3-5" for pages 1 and 3–5).
+    3. **Detect tables** on each selected page.
+    4. **Extract raw tables** (list of lists) from each page.
+    5. **Split header and body rows** with `detect_header_rows()`.
+    6. **Merge multi-line headers** (e.g. tables with two rows of column titles).
+    7. **Clean the table body** with `cleaning_column()`:
+       - remove the running-number column;
+       - align the number of columns with the header.
+    8. **Assemble the final result** as JSON with the structure:
+       {
+           "title": <table title>,
+           "columns": [...],
+           "rows": [...]
+       }
+    9. **Apply an extra filter** with `filter_geo_admin_column()` (geospatial metadata only).
+    10. **Return the result** as a list of JSON objects ready for the frontend API.
+
+    Args:
+        path (str): Location of the PDF file to read.
+        page (str): Page number or range, e.g. "1", "2-4", "1,3-5".
+
+    Returns:
+        list[dict]: Extracted tables with their column and row structure.
+
+    Raises:
+        PDFReadError: If reading or parsing the PDF fails.
+    """
+    try:
+        pdf_path = path
+        selectedPage = page if page else "1"
+        tables_data = []
+
+        with pdfplumber.open(pdf_path) as pdf:
+            total_pages = len(pdf.pages)
+            selected_pages = parse_page_selection(selectedPage, total_pages)
+
+            logger.info(f"[INFO] Total PDF pages: {total_pages}")
+            logger.info(f"[INFO] Pages selected for reading: {selected_pages}")
+
+            for page_num in selected_pages:
+                pdf_page = pdf.pages[page_num - 1]
+                tables = pdf_page.find_tables()
+                logger.info(f"[INFO] Page {page_num}: {len(tables)} tables detected")
+
+                for t in tables:
+                    table = t.extract()
+                    if len(table) > 2:
+                        tables_data.append(table)
+
+        logger.info(f"\nTotal valid tables: {len(tables_data)}\n")
+
+        header_only, body_only = [], []
+        for tbl in tables_data:
+            head, body = detect_header_rows(tbl)
+            header_only.append(head)
+            body_only.append(body)
+
+        clean_header = [merge_multiline_header(h) for h in header_only]
+        clean_body = []
+
+        for i, raw_body in enumerate(body_only):
+            con_body = [[cell for cell in row if cell not in (None, '')] for row in raw_body]
+            cleaned = cleaning_column(clean_header[i], [con_body])
+            clean_body.append(cleaned[0])
+
+        parsed = []
+        for i, (cols, rows) in enumerate(zip(clean_header, clean_body), start=1):
+            parsed.append({
+                "title": str(i),
+                "columns": cols,
+                "rows": rows
+            })
+
+        clean_parsed = filter_geo_admin_column(parsed)
+        return clean_parsed
+
+    except Exception as e:
+        raise PDFReadError(f"Failed to read PDF: {e}", code=422)
+
+
+def convert_df(payload):
+    try:
+        if "columns" not in payload or "rows" not in payload:
+            raise ValueError("Payload is missing the 'columns' or 'rows' key.")
+
+        if not isinstance(payload["columns"], list):
+            raise TypeError("'columns' must be a list.")
+        if not isinstance(payload["rows"], list):
+            raise TypeError("'rows' must be a list.")
+
+        for i, row in enumerate(payload["rows"]):
+            if len(row) != len(payload["columns"]):
+                raise ValueError(f"Row {i} does not have the same number of elements as there are columns.")
+
+        df = pd.DataFrame(payload["rows"], columns=payload["columns"])
+
+        if "title" in payload:
+            df.attrs["title"] = payload["title"]
+
+        return df
+
+    except Exception as e:
+        raise PDFReadError(f"Failed to convert payload to a DataFrame: {e}", code=400)
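A sketch of how these two functions chain together in practice (the path and page range are assumed values):

from services.upload_file.readers.reader_pdf import read_pdf, convert_df

tables = read_pdf("uploads/report.pdf", page="1,3-5")
if len(tables) == 1:
    df = convert_df(tables[0])            # single table -> DataFrame for further processing
else:
    print([t["title"] for t in tables])   # several tables -> let the client pick one first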
diff --git a/services/upload_file/read_shp/reader_shp.py b/services/upload_file/readers/reader_shp.py
similarity index 100%
rename from services/upload_file/read_shp/reader_shp.py
rename to services/upload_file/readers/reader_shp.py
diff --git a/services/upload_file/upload.py b/services/upload_file/upload.py
index dbf9fdd..1a77b42 100644
--- a/services/upload_file/upload.py
+++ b/services/upload_file/upload.py
@@ -1,27 +1,30 @@
+import json
 import os
 import pandas as pd
 import geopandas as gpd
 import numpy as np
+import re
 import zipfile
 from shapely.geometry.base import BaseGeometry
 from shapely.geometry import base as shapely_base
 from fastapi import File, Form, UploadFile, HTTPException
-from fastapi.responses import JSONResponse
 from core.config import UPLOAD_FOLDER, MAX_FILE_MB, VALID_WKT_PREFIXES
-from services.upload_file.read_csv.reader_csv import read_csv
-from services.upload_file.read_shp.reader_shp import read_shp
-from services.upload_file.read_gdb.reader_gdb import read_gdb
-from services.upload_file.read_mpk.reader_mpk import read_mpk
-from services.upload_file.read_pdf.reader_pdf import convert_df, read_pdf
-from services.upload_file.geom_detector.geometry_detector import detect_and_build_geometry
-from services.upload_file.geom_detector.geometry_detector import attach_polygon_geometry_auto
+from services.upload_file.readers.reader_csv import read_csv
+from services.upload_file.readers.reader_shp import read_shp
+from services.upload_file.readers.reader_gdb import read_gdb
+from services.upload_file.readers.reader_mpk import read_mpk
+from services.upload_file.readers.reader_pdf import convert_df, read_pdf
+from services.upload_file.utils.geometry_detector import detect_and_build_geometry
+from services.upload_file.utils.geometry_detector import attach_polygon_geometry_auto
 from database.connection import engine
 from database.models import Base
 from pydantic import BaseModel
 from typing import List, Optional
 from shapely import wkt
 from sqlalchemy import text
-
-Base.metadata.create_all(bind=engine)
+from datetime import datetime
+from response import successRes, errorRes
+# Base.metadata.create_all(bind=engine)
 
 
 def is_geom_empty(g):
@@ -78,7 +81,7 @@ def process_data(df: pd.DataFrame, ext: str):
         print(f"[INFO] Tipe Geometry: {geom_type}")
         print(f"[INFO] Jumlah geometry kosong: {null_geom}")
     else:
-        response = {
+        res = {
             "message": "Tidak menemukan tabel yang relevan.",
             "file_type": ext,
             "rows": 0,
@@ -91,7 +94,7 @@
             "preview": []
         }
 
-        return JSONResponse(content=response)
+        return errorRes(message="Failed to match a geometry in the table.", details=res, status_code=422)
 
     result = result.replace([pd.NA, float('inf'), float('-inf')], None)
     if isinstance(result, gpd.GeoDataFrame) and 'geometry' in result.columns:
@@ -116,7 +119,8 @@ def process_data(df: pd.DataFrame, ext: str):
     else:
         warning_examples = []
 
-    preview_data = result.to_dict(orient="records")
+    preview_data = result.head(10).to_dict(orient="records")
+    # preview_data = result.to_dict(orient="records")
     preview_safe = [
         {k: safe_json(v) for k, v in row.items()}
         for row in preview_data
@@ -139,7 +143,7 @@ def process_data(df: pd.DataFrame, ext: str):
         "preview": preview_safe
     }
 
-    # return JSONResponse(content=response)
+    # return successRes(content=response)
     return response
 
 
@@ -157,7 +161,7 @@ async def handle_upload_file(file: UploadFile = File(...), page: Optional[str] =
     contents = await file.read()
     size_mb = len(contents) / (1024*1024)
     if size_mb > MAX_FILE_MB:
-        raise HTTPException(status_code=413, detail="Ukuran File Terlalu Besar")
+        raise errorRes(status_code=413, message="File too large")
     tmp_path = UPLOAD_FOLDER / fname
     with open(tmp_path, "wb") as f:
         f.write(contents)
@@ -174,19 +178,19 @@ async def handle_upload_file(file: UploadFile = File(...), page: Optional[str] =
     elif ext == ".pdf":
         tbl = read_pdf(tmp_path, page)
         if len(tbl) == 0:
-            response = {
+            res = {
                 "message": "Tidak ditemukan tabel valid",
-                "tables": tbl,
+                "tables": {},
                 "file_type": ext
             }
-            return JSONResponse(content=response)
+            return successRes(message="No valid table found", data=res)
         elif len(tbl) > 1:
-            response = {
+            res = {
                 "message": "File berhasil dibaca dan dianalisis.",
                 "tables": tbl,
                 "file_type": ext
             }
-            return JSONResponse(content=response)
+            return successRes(data=res, message="File read and analyzed successfully.")
         else:
             df = convert_df(tbl[0])
     elif ext == ".zip":
@@ -201,25 +205,29 @@ async def handle_upload_file(file: UploadFile = File(...), page: Optional[str] =
             df = read_gdb(str(tmp_path))
 
         else:
-            raise HTTPException(
+            raise errorRes(
                 status_code=400,
-                detail="ZIP file tidak mengandung SHP atau GDB yang valid."
+                message="ZIP file does not contain a valid SHP or GDB."
             )
 
     else:
-        raise HTTPException(status_code=400, detail="Unsupported file type")
+        raise errorRes(status_code=400, message="Unsupported file type")
 
     if df is None or (hasattr(df, "empty") and df.empty):
-        return JSONResponse({"error": "No valid table detected"}, status_code=400)
+        return successRes(message="File was read successfully, but no valid table was found")
 
     res = process_data(df, ext)
 
     tmp_path.unlink(missing_ok=True)
-    return JSONResponse(content=res)
+    return successRes(data=res)
 
     except Exception as e:
         print(f"[ERROR] {e}")
-        return JSONResponse({"error": str(e)}, status_code=500)
+        return errorRes(
+            message="Internal Server Error",
+            details=str(e),
+            status_code=500
+        )
 
     # finally:
     #     db_session.close()
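From the client side, the upload endpoint registered under the `/upload` prefix in main.py would be exercised roughly like this (host, file name, and sheet are assumed values):

import requests

with open("data.xlsx", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/upload/file",
        files={"file": f},                    # field name matches the UploadFile parameter
        data={"page": "", "sheet": "Sheet1"}, # optional form fields
    )
print(resp.json()["status"], resp.json()["message"])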
@@ -237,16 +245,15 @@ async def handle_process_pdf(payload: PdfRequest):
     try:
         df = convert_df(payload.model_dump())
         if df is None or (hasattr(df, "empty") and df.empty):
-            return JSONResponse({"error": "No valid table detected"}, status_code=400)
+            return errorRes(message="No table found")
 
         res = process_data(df, '.pdf')
-
-        return JSONResponse(content=res)
+        return successRes(data=res)
 
     except Exception as e:
         print(f"[ERROR] {e}")
-        return JSONResponse({"error": str(e)}, status_code=500)
+        return errorRes(message="Internal Server Error", details=str(e), status_code=500)
 
     # finally:
     #     db_session.close()
@@ -263,39 +270,351 @@ class UploadRequest(BaseModel):
     rows: List[dict]
     columns: List[str]
 
-def handle_to_postgis(payload: UploadRequest):
-    try:
-        table_name = payload.title.lower().replace(" ", "_").replace("-","_")
+# import time
+# def handle_to_postgis(payload: UploadRequest):
+#     # time.sleep(2)
+#     # return {
+#     #     "table_name": 'test',
+#     #     "status": "success",
+#     #     "message": f"Tabel test berhasil diunggah ke PostGIS.",
+#     #     "total_rows": 999,
+#     #     "geometry_type": 'POINT'
+#     # }
+#     try:
+#         table_name = payload.title.lower().replace(" ", "_").replace("-","_")
+#         df = pd.DataFrame(payload.rows)
+#         print(f"[INFO] Diterima {len(df)} baris data dari frontend.")
+
+#         if "geometry" in df.columns:
df["geometry"] = df["geometry"].apply( +# lambda g: wkt.loads(g) if isinstance(g, str) and g.strip().upper().startswith(VALID_WKT_PREFIXES) else None +# ) + +# gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326") + +# insert_count = 0 +# with engine.begin() as conn: + +# # 💡 Tambahkan blok auto-create partisi di sini +# conn.execute(text(f""" +# DO $$ +# BEGIN +# IF NOT EXISTS ( +# SELECT 1 FROM pg_tables WHERE tablename = 'test_partition_user_{user_id}' +# ) THEN +# EXECUTE format(' +# CREATE TABLE test_partition_user_%s +# PARTITION OF test_partition +# FOR VALUES IN (%s); +# ', {user_id}, {user_id}); +# EXECUTE format('CREATE INDEX ON test_partition_user_%s USING GIST (geom);', {user_id}); +# EXECUTE format('CREATE INDEX ON test_partition_user_%s USING GIN (properties);', {user_id}); +# END IF; +# END +# $$; +# """)) + +# # Lanjut insert data seperti biasa +# for _, row in gdf.iterrows(): +# geom_wkt = row["geometry"].wkt if row["geometry"] is not None else None +# properties = row.drop(labels=["geometry"]).to_dict() + +# conn.execute( +# text(""" +# INSERT INTO test_partition (user_id, geom, properties, created_at) +# VALUES (:user_id, ST_Force2D(ST_GeomFromText(:geom, 4326)), CAST(:properties AS jsonb), :created_at) +# """), +# { +# "user_id": user_id, +# "geom": geom_wkt, +# "properties": json.dumps(properties), +# "created_at": datetime.utcnow() +# } +# ) +# insert_count += 1 + +# print(f"[INFO] Berhasil memasukkan {insert_count} baris ke partisi user_id={user_id}.") + +# return { +# "table_name": table_name, +# "user_id": user_id, +# "status": "success", +# "message": f"Data berhasil ditambahkan ke partisi user_id={user_id}.", +# "total_rows": insert_count, +# "geometry_type": list(gdf.geom_type.unique()) +# } + +# except Exception as e: +# print(f"[ERROR] Gagal upload ke PostGIS partition: {e}") +# raise HTTPException(status_code=500, detail=str(e)) + + + + +# Daftar prefix WKT yang valid +VALID_WKT_PREFIXES = ("POINT", "LINESTRING", "POLYGON", "MULTIPOLYGON", "MULTILINESTRING") + + +def slugify(value: str) -> str: + """Mengubah judul dataset jadi nama aman untuk VIEW""" + return re.sub(r'[^a-zA-Z0-9]+', '_', value.lower()).strip('_') + + +# async def create_dataset_view_from_metadata(conn, metadata_id: int, user_id: int, title: str): +# """Membuat VIEW PostgreSQL berdasarkan metadata dataset dan registrasi geometry untuk QGIS.""" +# norm_title = slugify(title) +# view_name = f"v_user_{user_id}_{norm_title}" +# base_table = f"test_partition_user_{user_id}" + +# # 1️⃣ Hapus view lama jika ada +# drop_query = text(f"DROP VIEW IF EXISTS {view_name} CASCADE;") +# await conn.execute(drop_query) + +# # 2️⃣ Buat view baru +# create_view_query = text(f""" +# CREATE OR REPLACE VIEW {view_name} AS +# SELECT p.*, m.title, m.year, m.description +# FROM {base_table} p +# JOIN dataset_metadata m ON m.id = p.metadata_id +# WHERE p.metadata_id = {metadata_id}; +# """) +# await conn.execute(create_view_query) + +# # 3️⃣ Daftarkan geometry column agar QGIS mengenali layer ini +# # (gunakan Populate_Geometry_Columns jika PostGIS >= 3) +# populate_query = text(f"SELECT Populate_Geometry_Columns('{view_name}'::regclass);") +# await conn.execute(populate_query) + +# print(f"[INFO] VIEW {view_name} berhasil dibuat dan geometry terdaftar.") + + +async def create_dataset_view_from_metadata(conn, metadata_id: int, user_id: int, title: str): + """Buat VIEW dinamis sesuai struktur atribut JSON pada dataset (hindari duplikasi nama kolom).""" + norm_title = slugify(title) + view_name = 
f"v_user_{user_id}_{norm_title}" + base_table = f"test_partition_user_{user_id}" + + # Ambil daftar field dari metadata + result = await conn.execute(text("SELECT fields FROM dataset_metadata WHERE id=:mid"), {"mid": metadata_id}) + fields_json = result.scalar_one_or_none() + + # --- daftar kolom bawaan dari tabel utama + base_columns = {"id", "user_id", "metadata_id", "geom"} + + columns_sql = "" + field_list = [] + + if fields_json: + try: + # handle jika data sudah berupa list atau string JSON + if isinstance(fields_json, str): + field_list = json.loads(fields_json) + elif isinstance(fields_json, list): + field_list = fields_json + else: + raise ValueError(f"Tipe data fields_json tidak dikenali: {type(fields_json)}") + + for f in field_list: + safe_col = slugify(f) + # Hindari duplikat nama dengan kolom utama + if safe_col in base_columns: + alias_name = f"attr_{safe_col}" + else: + alias_name = safe_col + + columns_sql += f", p.attributes->>'{f}' AS {alias_name}" + + except Exception as e: + print(f"[WARN] Gagal parse field list metadata: {e}") + + # 1️⃣ Drop view lama + await conn.execute(text(f"DROP VIEW IF EXISTS {view_name} CASCADE;")) + + # 2️⃣ Buat view baru dinamis + create_view_query = f""" + CREATE OR REPLACE VIEW {view_name} AS + SELECT p.id, p.user_id, p.metadata_id, p.geom + {columns_sql}, + m.title, m.year, m.description + FROM {base_table} p + JOIN dataset_metadata m ON m.id = p.metadata_id + WHERE p.metadata_id = {metadata_id}; + """ + await conn.execute(text(create_view_query)) + + # 3️⃣ Register geometry untuk QGIS + await conn.execute(text(f"SELECT Populate_Geometry_Columns('{view_name}'::regclass);")) + + print(f"[INFO] VIEW {view_name} berhasil dibuat (kolom dinamis: {field_list if field_list else '(none)'}).") + + +async def handle_to_postgis(payload, engine, user_id: int = 3): + """ + Menangani upload data spasial ke PostGIS (dengan partition per user). 
+
+
+async def handle_to_postgis(payload, engine, user_id: int = 3):
+    """
+    Handle uploading spatial data to PostGIS (with one partition per user).
+    - If the user's partition does not exist yet, it is created automatically
+    - Dataset metadata is stored in the dataset_metadata table
+    - The spatial rows are inserted into the partition table (test_partition_user_{id})
+    - A VIEW is created automatically for QGIS
+    """
+
+    try:
+        df = pd.DataFrame(payload.rows)
+        print(f"[INFO] Received {len(df)} rows of data from the frontend.")
+
+        # --- Validate the geometry column ---
+        if "geometry" not in df.columns:
+            raise errorRes(status_code=400, message="Column 'geometry' not found in the data.")
+
+        # --- Parse geometry strings into shapely objects ---
+        df["geometry"] = df["geometry"].apply(
+            lambda g: wkt.loads(g)
+            if isinstance(g, str) and g.strip().upper().startswith(VALID_WKT_PREFIXES)
+            else None
+        )
+
+        # --- Build the GeoDataFrame ---
+        gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")
+
+        # --- Dataset metadata ---
+        # TODO: replace these hardcoded test values with fields from the payload,
+        # e.g. getattr(payload, "dataset_title", None)
+        dataset_title = "longsor 2020"
+        dataset_year = 2020
+        dataset_desc = "test metadata"
+
+        if not dataset_title:
+            raise errorRes(status_code=400, message="Field 'dataset_title' is required for the metadata.")
+
+        async with engine.begin() as conn:
+            fields = [col for col in df.columns if col != "geometry"]
+
+            # 💾 1️⃣ Store the dataset metadata
+            print("[INFO] Saving dataset metadata...")
+            result = await conn.execute(
+                text("""
+                    INSERT INTO dataset_metadata (user_id, title, year, description, fields, created_at)
+                    VALUES (:user_id, :title, :year, :desc, :fields, :created_at)
+                    RETURNING id;
+                """),
+                {
+                    "user_id": user_id,
+                    "title": dataset_title,
+                    "year": dataset_year,
+                    "desc": dataset_desc,
+                    "fields": json.dumps(fields),
+                    "created_at": datetime.utcnow(),
+                },
+            )
+            metadata_id = result.scalar_one()
+            print(f"[INFO] Metadata saved with ID {metadata_id}")
+
+            # ⚙️ 2️⃣ Auto-create the partition if it does not exist yet
+            print(f"[INFO] Ensuring partition test_partition_user_{user_id} exists...")
+            await conn.execute(
+                text(f"""
+                    DO $$
+                    BEGIN
+                        IF NOT EXISTS (
+                            SELECT 1 FROM pg_tables WHERE tablename = 'test_partition_user_{user_id}'
+                        ) THEN
+                            EXECUTE format('
+                                CREATE TABLE test_partition_user_%s
+                                PARTITION OF test_partition
+                                FOR VALUES IN (%s);
+                            ', {user_id}, {user_id});
+                            EXECUTE format('CREATE INDEX IF NOT EXISTS idx_partition_user_%s_geom ON test_partition_user_%s USING GIST (geom);', {user_id}, {user_id});
+                            EXECUTE format('CREATE INDEX IF NOT EXISTS idx_partition_user_%s_metadata ON test_partition_user_%s (metadata_id);', {user_id}, {user_id});
+                        END IF;
+                    END
+                    $$;
+                """)
+            )
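+            # NOTE (assumption, not part of this diff): the DO block above presupposes
+            # a parent table created beforehand as a LIST partition on user_id, roughly:
+            #   CREATE TABLE test_partition (
+            #       id          BIGSERIAL,
+            #       user_id     INT NOT NULL,
+            #       metadata_id INT,
+            #       geom        geometry(Geometry, 4326),
+            #       attributes  JSONB,
+            #       created_at  TIMESTAMP,
+            #       PRIMARY KEY (id, user_id)
+            #   ) PARTITION BY LIST (user_id);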
+
+            # 🧩 3️⃣ Insert the spatial rows into the partition
+            print(f"[INFO] Inserting data into test_partition_user_{user_id} ...")
+            insert_count = 0
+            for _, row in gdf.iterrows():
+                geom_wkt = row["geometry"].wkt if row["geometry"] is not None else None
+                attributes = row.drop(labels=["geometry"]).to_dict()
+
+                await conn.execute(
+                    text("""
+                        INSERT INTO test_partition (user_id, metadata_id, geom, attributes, created_at)
+                        VALUES (:user_id, :metadata_id, ST_Force2D(ST_GeomFromText(:geom, 4326)),
+                                CAST(:attr AS jsonb), :created_at);
+                    """),
+                    {
+                        "user_id": user_id,
+                        "metadata_id": metadata_id,
+                        "geom": geom_wkt,
+                        "attr": json.dumps(attributes),
+                        "created_at": datetime.utcnow(),
+                    },
+                )
+                insert_count += 1
+
+            # 🧩 4️⃣ Create the VIEW for the new dataset in QGIS
+            await create_dataset_view_from_metadata(conn, metadata_id, user_id, dataset_title)
+
+        print(f"[INFO] ✅ Inserted {insert_count} rows into the partition for user_id={user_id} (metadata_id={metadata_id}).")
+
+        return {
+            "status": "success",
+            "user_id": user_id,
+            "metadata_id": metadata_id,
+            "dataset_title": dataset_title,
+            "inserted_rows": insert_count,
+            "geometry_type": list(gdf.geom_type.unique()),
+        }
+
+    except errorRes:
+        # let structured 4xx errors propagate instead of being remapped to 500 below
+        raise
+    except Exception as e:
+        print(f"[ERROR] Failed to upload to the PostGIS partition: {e}")
+        raise errorRes(status_code=500, message="Failed to upload to the PostGIS partition", details=str(e))
\ No newline at end of file
diff --git a/services/upload_file/upload_exceptions.py b/services/upload_file/upload_exceptions.py
new file mode 100644
index 0000000..ae496ce
--- /dev/null
+++ b/services/upload_file/upload_exceptions.py
@@ -0,0 +1,9 @@
+class PDFReadError(Exception):
+    """Custom exception for errors while reading a PDF file."""
+    def __init__(self, message: str, code: int = 400):
+        super().__init__(message)
+        self.message = message
+        self.code = code
+
+    def to_dict(self):
+        return {"error": self.message, "code": self.code}
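A quick, hypothetical example of how this exception is meant to travel from the PDF reader up to the API layer (`read_pdf` here stands in for the reader used by `handle_upload_file`; the surrounding names are assumptions):

# hypothetical call site; PDFReadError carries its own HTTP-ish status code
from services.upload_file.upload_exceptions import PDFReadError

try:
    tables = read_pdf(tmp_path, page)
except PDFReadError as e:
    print(e.to_dict())  # {"error": "<message>", "code": 400}
    raise errorRes(status_code=e.code, message=e.message)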
diff --git a/services/upload_file/utils/__pycache__/geometry_detector.cpython-39.pyc b/services/upload_file/utils/__pycache__/geometry_detector.cpython-39.pyc
new file mode 100644
index 0000000..5a212a8
Binary files /dev/null and b/services/upload_file/utils/__pycache__/geometry_detector.cpython-39.pyc differ
diff --git a/services/upload_file/utils/__pycache__/pdf_cleaner.cpython-39.pyc b/services/upload_file/utils/__pycache__/pdf_cleaner.cpython-39.pyc
new file mode 100644
index 0000000..e232581
Binary files /dev/null and b/services/upload_file/utils/__pycache__/pdf_cleaner.cpython-39.pyc differ
diff --git a/services/upload_file/geom_detector/geometry_detector.py b/services/upload_file/utils/geometry_detector.py
similarity index 100%
rename from services/upload_file/geom_detector/geometry_detector.py
rename to services/upload_file/utils/geometry_detector.py
diff --git a/services/upload_file/utils/pdf_cleaner.py b/services/upload_file/utils/pdf_cleaner.py
new file mode 100644
index 0000000..86d2dd1
--- /dev/null
+++ b/services/upload_file/utils/pdf_cleaner.py
@@ -0,0 +1,159 @@
+import re
+import itertools
+
+# keyword lists stay in Indonesian on purpose: they must match real column headers
+geo_admin_keywords = [
+    'lat', 'lon', 'long', 'latitude', 'longitude', 'koordinat', 'geometry', 'geometri',
+    'desa', 'kelurahan', 'kel', 'kecamatan', 'kabupaten', 'kab', 'kota', 'provinsi',
+    'lokasi', 'region', 'area', 'zone', 'boundary', 'batas'
+]
+
+def normalize_text(text):
+    text = text.lower()
+    text = re.sub(r'[^a-z0-9/ ]+', ' ', text)
+    text = re.sub(r'\s+', ' ', text).strip()
+    return text
+
+def generate_combined_patterns(keywords):
+    # patterns like "desa/kelurahan" for every keyword pair, in both orders
+    combos = list(itertools.combinations(keywords, 2))
+    patterns = []
+    for a, b in combos:
+        patterns.append(rf'{a}\s*/\s*{b}')
+        patterns.append(rf'{b}\s*/\s*{a}')
+    return patterns
+
+combined_patterns = generate_combined_patterns(geo_admin_keywords)
+
+def contains_geo_admin_keywords(text):
+    text_clean = normalize_text(text)
+    if len(text_clean) < 3:
+        return False
+
+    for pattern in combined_patterns:
+        if re.search(pattern, text_clean):
+            return True
+
+    for kw in geo_admin_keywords:
+        if re.search(rf'(^|[\s/_-]){kw}([\s/_-]|$)', text_clean):
+            return True
+
+    return False
+
+def filter_geo_admin_column(tables):
+    filtered = []
+    for table in tables:
+        found = any(contains_geo_admin_keywords(col) for col in table['columns'])
+        if found:
+            filtered.append(table)
+    return filtered
+
+
+NUMBER_HEADER_KEYWORDS = [
+    "no","no.","nomor","nomor urut","no urut","No","Nomor","No Urut","Index",
+    "ID","Sr No","S/N","SN","Sl No"
+]
+
+def has_number_header(header):
+    # `in` works for both a header string (substring) and a list of column names (membership)
+    return any(keyword in header for keyword in NUMBER_HEADER_KEYWORDS)
+
+def is_numbering_column(col_values):
+    numeric_like = 0
+    total = 0
+    for v in col_values:
+        if not v or not isinstance(v, str):
+            continue
+        total += 1
+        if re.fullmatch(r"0*\d{1,3}", v.strip()):
+            numeric_like += 1
+    return total > 0 and (numeric_like / total) > 0.6
+
+def is_numeric_value(v):
+    if v is None:
+        return False
+    if isinstance(v, (int, float)):
+        return True
+    if isinstance(v, str) and re.fullmatch(r"0*\d{1,3}", v.strip()):
+        return True
+    return False
+
+def cleaning_column(headers, bodies):
+    cleaned_bodies = []
+
+    for header, body in zip(headers, bodies):
+        if not body:
+            cleaned_bodies.append(body)
+            continue
+
+        header_has_number = has_number_header(header)
+        first_col = [row[0] for row in body if row and len(row) > 0]
+        first_col_is_numbering = is_numbering_column(first_col)
+
+        # drop an unnamed leading numbering column ("1", "2", ...) the header doesn't declare
+        if not header_has_number and first_col_is_numbering:
+            new_body = []
+            for row in body:
+                if not row:
+                    continue
+                first_val = row[0]
+                if is_numeric_value(first_val) and len(row) > 1:
+                    new_body.append(row[1:])
+                else:
+                    new_body.append(row)
+            body = new_body
+
+        # keep only rows whose length matches this table's header
+        # (bug fix: was len(headers), i.e. the number of tables, not the column count)
+        header_len = len(header)
+        filtered_body = [row for row in body if len(row) == header_len]
+
+        cleaned_bodies.append(filtered_body)
+
+    return cleaned_bodies
+
+def parse_page_selection(selectedPage: str, total_pages: int):
+    if not selectedPage:
+        return list(range(1, total_pages + 1))
+
+    pages = set()
+    parts = re.split(r'[,\s]+', selectedPage.strip())
+
+    for part in parts:
+        if '-' in part:
+            try:
+                start, end = map(int, part.split('-'))
+                pages.update(range(start, end + 1))
+            except ValueError:
+                continue
+        else:
+            try:
+                pages.add(int(part))
+            except ValueError:
+                continue
+
+    valid_pages = [p for p in sorted(pages) if 1 <= p <= total_pages]
+    return valid_pages
+
+def is_number(s):
+    if s is None:
+        return False
+    s = str(s).strip().replace(',', '').replace('.', '')
+    return s.isdigit()
+
+def row_ratio(row):
+    non_empty = [c for c in row if c not in (None, '', ' ')]
+    if not non_empty:
+        return 0
+    num_count = sum(is_number(c) for c in non_empty)
+    return num_count / len(non_empty)
+
+def has_mixed_text_and_numbers(row):
+    non_empty = [c for c in row if c not in (None, '', ' ')]
+    has_text = any(isinstance(c, str) and re.search(r'[A-Za-z]', str(c)) for c in non_empty)
+    has_num = any(is_number(c) for c in non_empty)
+    return has_text and has_num
+
+def is_short_text_row(row):
+    """Detect short text-only rows (one or two short text cells)."""
+    non_empty = [str(c).strip() for c in row if c not in (None, '', ' ')]
+    if not non_empty:
+        return False
+    text_only = all(not is_number(c) for c in non_empty)
+    joined = " ".join(non_empty)
+    return text_only and len(non_empty) <= 2 and len(joined) < 20
\ No newline at end of file
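The page selector accepts comma- or space-separated single pages and ranges, clamped to the document. A few quick checks of the function above:

# behavior of parse_page_selection on sample inputs
print(parse_page_selection("1-3, 7", total_pages=10))  # -> [1, 2, 3, 7]
print(parse_page_selection("8-12", total_pages=10))    # -> [8, 9, 10]  (out-of-range pages dropped)
print(parse_page_selection("", total_pages=3))         # -> [1, 2, 3]   (empty selection means all pages)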
diff --git a/utils/logger_config.py b/utils/logger_config.py
new file mode 100644
index 0000000..4d2803a
--- /dev/null
+++ b/utils/logger_config.py
@@ -0,0 +1,32 @@
+import logging
+import os
+
+LOG_DIR = "logs"
+os.makedirs(LOG_DIR, exist_ok=True)
+
+def setup_logger(name: str):
+    """
+    Standard logger configuration for all services.
+    Log format:
+    [LEVEL] [module name] message
+    """
+    logger = logging.getLogger(name)
+    logger.setLevel(logging.INFO)
+
+    # only attach handlers once, even if setup_logger is called repeatedly
+    if not logger.handlers:
+        # handler that writes to a file
+        file_handler = logging.FileHandler(os.path.join(LOG_DIR, "app.log"))
+        file_handler.setLevel(logging.INFO)
+
+        # handler for the console (stdout)
+        console_handler = logging.StreamHandler()
+        console_handler.setLevel(logging.INFO)
+
+        formatter = logging.Formatter('[%(levelname)s] [%(name)s] %(message)s')
+        file_handler.setFormatter(formatter)
+        console_handler.setFormatter(formatter)
+
+        logger.addHandler(file_handler)
+        logger.addHandler(console_handler)
+
+    return logger
diff --git a/utils/qgis_init.py b/utils/qgis_init.py
new file mode 100644
index 0000000..9c27cb4
--- /dev/null
+++ b/utils/qgis_init.py
@@ -0,0 +1,30 @@
+# utils/qgis_init.py
+import os
+import sys
+
+# QGIS installation location on Linux (Ubuntu / Debian)
+QGIS_PREFIX = "/usr"
+
+# Path to the QGIS Python modules
+sys.path.append("/usr/share/qgis/python")
+
+# Environment variables so QGIS can run headless (without a GUI)
+os.environ["QGIS_PREFIX_PATH"] = QGIS_PREFIX
+os.environ["QT_QPA_PLATFORM"] = "offscreen"
+
+from qgis.core import QgsApplication
+from qgis.analysis import QgsNativeAlgorithms
+import processing
+from processing.core.Processing import Processing
+
+
+def init_qgis():
+    qgs = QgsApplication([], False)
+    qgs.initQgis()
+
+    # Register the QGIS processing providers
+    Processing.initialize()
+    QgsApplication.processingRegistry().addProvider(QgsNativeAlgorithms())
+
+    print("QGIS initialized successfully")
+    return qgs
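A hedged usage sketch for the headless initializer. It assumes a working QGIS install at the paths above; `native:buffer` is a standard QGIS algorithm id, while the input path and output name are hypothetical:

# run a native algorithm headlessly after init_qgis()
from utils.qgis_init import init_qgis
import processing  # resolvable once qgis_init has appended the QGIS python path

qgs = init_qgis()
try:
    result = processing.run("native:buffer", {
        "INPUT": "/tmp/example.shp",   # hypothetical input layer
        "DISTANCE": 0.01,
        "OUTPUT": "memory:buffered",
    })
finally:
    qgs.exitQgis()  # release QGIS resources when done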