From f4113e7f0dffceca3675007dd30c424af331d677 Mon Sep 17 00:00:00 2001 From: DmsAnhr Date: Mon, 17 Nov 2025 10:53:15 +0700 Subject: [PATCH] testing qgis server --- .../auth_dependency.cpython-39.pyc | Bin 0 -> 1245 bytes .../role_dependency.cpython-39.pyc | Bin 0 -> 961 bytes api/deps/auth_dependency.py | 34 ++ api/deps/role_dependency.py | 20 + .../__pycache__/auth_router.cpython-39.pyc | Bin 0 -> 813 bytes .../__pycache__/system_router.cpython-39.pyc | Bin 0 -> 777 bytes .../upload_file_router.cpython-39.pyc | Bin 0 -> 1605 bytes api/routers/auth_router.py | 14 + api/routers/datasets_router.py | 20 + .../router.py => api/routers/system_router.py | 0 {routes => api/routers}/upload_file_router.py | 10 +- database/connection.py | 7 +- database/models.py | 28 +- database/uploader.py | 16 - init_db.py | 3 - main.py | 40 +- response.py | 22 + routes/auth_router.py | 15 - services/.DS_Store | Bin 0 -> 6148 bytes services/auth/auth.py | 0 services/auth/login.py | 49 ++ services/datasets/delete.py | 33 ++ services/upload_file/.DS_Store | Bin 0 -> 6148 bytes services/upload_file/read_csv/reader_csv.py | 283 ------------ .../upload_file/read_pdf/filter_column.py | 47 -- services/upload_file/read_pdf/reader_pdf.py | 270 ----------- .../__pycache__/reader_csv.cpython-39.pyc | Bin 0 -> 4085 bytes .../__pycache__/reader_gdb.cpython-39.pyc | Bin 0 -> 1862 bytes .../__pycache__/reader_mpk.cpython-39.pyc | Bin 0 -> 2263 bytes .../__pycache__/reader_pdf.cpython-39.pyc | Bin 0 -> 5956 bytes .../__pycache__/reader_shp.cpython-39.pyc | Bin 0 -> 1641 bytes services/upload_file/readers/reader_csv.py | 116 +++++ .../{read_gdb => readers}/reader_gdb.py | 0 .../{read_mpk => readers}/reader_mpk.py | 0 services/upload_file/readers/reader_pdf.py | 168 +++++++ .../{read_shp => readers}/reader_shp.py | 0 services/upload_file/upload.py | 419 +++++++++++++++--- services/upload_file/upload_exceptions.py | 9 + .../geometry_detector.cpython-39.pyc | Bin 0 -> 13831 bytes .../__pycache__/pdf_cleaner.cpython-39.pyc | Bin 0 -> 5960 bytes .../geometry_detector.py | 0 services/upload_file/utils/pdf_cleaner.py | 159 +++++++ utils/logger_config.py | 32 ++ utils/qgis_init.py | 30 ++ 44 files changed, 1148 insertions(+), 696 deletions(-) create mode 100644 api/deps/__pycache__/auth_dependency.cpython-39.pyc create mode 100644 api/deps/__pycache__/role_dependency.cpython-39.pyc create mode 100644 api/deps/auth_dependency.py create mode 100644 api/deps/role_dependency.py create mode 100644 api/routers/__pycache__/auth_router.cpython-39.pyc create mode 100644 api/routers/__pycache__/system_router.cpython-39.pyc create mode 100644 api/routers/__pycache__/upload_file_router.cpython-39.pyc create mode 100644 api/routers/auth_router.py create mode 100644 api/routers/datasets_router.py rename routes/router.py => api/routers/system_router.py (100%) rename {routes => api/routers}/upload_file_router.py (63%) delete mode 100644 database/uploader.py delete mode 100644 init_db.py create mode 100644 response.py delete mode 100644 routes/auth_router.py create mode 100644 services/.DS_Store delete mode 100644 services/auth/auth.py create mode 100644 services/auth/login.py create mode 100644 services/datasets/delete.py create mode 100644 services/upload_file/.DS_Store delete mode 100644 services/upload_file/read_csv/reader_csv.py delete mode 100644 services/upload_file/read_pdf/filter_column.py delete mode 100644 services/upload_file/read_pdf/reader_pdf.py create mode 100644 services/upload_file/readers/__pycache__/reader_csv.cpython-39.pyc create 
mode 100644 services/upload_file/readers/__pycache__/reader_gdb.cpython-39.pyc
 create mode 100644 services/upload_file/readers/__pycache__/reader_mpk.cpython-39.pyc
 create mode 100644 services/upload_file/readers/__pycache__/reader_pdf.cpython-39.pyc
 create mode 100644 services/upload_file/readers/__pycache__/reader_shp.cpython-39.pyc
 create mode 100644 services/upload_file/readers/reader_csv.py
 rename services/upload_file/{read_gdb => readers}/reader_gdb.py (100%)
 rename services/upload_file/{read_mpk => readers}/reader_mpk.py (100%)
 create mode 100644 services/upload_file/readers/reader_pdf.py
 rename services/upload_file/{read_shp => readers}/reader_shp.py (100%)
 create mode 100644 services/upload_file/upload_exceptions.py
 create mode 100644 services/upload_file/utils/__pycache__/geometry_detector.cpython-39.pyc
 create mode 100644 services/upload_file/utils/__pycache__/pdf_cleaner.cpython-39.pyc
 rename services/upload_file/{geom_detector => utils}/geometry_detector.py (100%)
 create mode 100644 services/upload_file/utils/pdf_cleaner.py
 create mode 100644 utils/logger_config.py
 create mode 100644 utils/qgis_init.py

diff --git a/api/deps/__pycache__/auth_dependency.cpython-39.pyc b/api/deps/__pycache__/auth_dependency.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..43986f34d2afe0e93c8d0a5ed25924988d5352a0
Binary files /dev/null and b/api/deps/__pycache__/auth_dependency.cpython-39.pyc differ
diff --git a/api/deps/__pycache__/role_dependency.cpython-39.pyc b/api/deps/__pycache__/role_dependency.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e10ee0e41642893acc980786dd4b01e81df87301
Binary files /dev/null and b/api/deps/__pycache__/role_dependency.cpython-39.pyc differ
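
The compiled __pycache__ artifacts above correspond to api/deps/auth_dependency.py (34 lines) and api/deps/role_dependency.py (20 lines) in the diffstat, whose source hunks are not shown here. The routers below import require_role from that module; the following is a minimal, hypothetical sketch of such a role guard, assuming a get_current_user dependency from auth_dependency.py, and not the actual implementation:

    from fastapi import Depends, HTTPException

    async def get_current_user():
        # Placeholder: the real dependency lives in api/deps/auth_dependency.py (not shown).
        raise HTTPException(status_code=401, detail="Not authenticated")

    def require_role(role: str):
        # Factory returning a FastAPI dependency that rejects users lacking `role`.
        async def checker(user=Depends(get_current_user)):
            if getattr(user, "role", None) != role:
                raise HTTPException(status_code=403, detail="Insufficient role")
            return user
        return checker
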
diff --git a/api/routers/__pycache__/upload_file_router.cpython-39.pyc b/api/routers/__pycache__/upload_file_router.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c212b54feb6a436370a7500f481954c4d98d7356
Binary files /dev/null and b/api/routers/__pycache__/upload_file_router.cpython-39.pyc differ
diff --git a/api/routers/auth_router.py b/api/routers/auth_router.py
new file mode 100644
index 0000000..8e8140e
--- /dev/null
+++ b/api/routers/auth_router.py
@@ -0,0 +1,14 @@
+from fastapi import APIRouter, Depends
+from pydantic import BaseModel
+from sqlalchemy.ext.asyncio import AsyncSession
+from services.auth.login import loginService, get_db
+
+router = APIRouter()
+
+class LoginRequest(BaseModel):
+    username: str
+    password: str
+
+@router.post("/login")
+async def login(request: LoginRequest, db: AsyncSession = Depends(get_db)):
+    return await loginService(request.username, request.password, db)
diff --git a/api/routers/datasets_router.py b/api/routers/datasets_router.py
new file mode 100644
index 0000000..a88282f
--- /dev/null
+++ b/api/routers/datasets_router.py
@@ -0,0 +1,20 @@
+from fastapi import APIRouter
+from core.config import engine
+from services.datasets.delete import delete_dataset_from_partition  # import the function defined in the service layer
+from response import successRes, errorRes
+
+router = APIRouter()
+
+@router.delete("/dataset/{user_id}/{metadata_id}")
+async def delete_dataset(user_id: int, metadata_id: int, title: str):
+    """
+    Delete a specific dataset (identified by user_id and metadata_id).
+    """
+    try:
+        async with engine.begin() as conn:
+            await delete_dataset_from_partition(conn, user_id, metadata_id, title)
+        return successRes(message=f"Dataset {title} deleted successfully.", data="")
+
+    except Exception as e:
+        print(f"[ERROR] Failed to delete dataset: {e}")
+        raise errorRes(status_code=500, details=str(e), message="Failed to delete dataset")
diff --git a/routes/router.py b/api/routers/system_router.py
similarity index 100%
rename from routes/router.py
rename to api/routers/system_router.py
diff --git a/routes/upload_file_router.py b/api/routers/upload_file_router.py
similarity index 63%
rename from routes/upload_file_router.py
rename to api/routers/upload_file_router.py
index 9daadb6..dbcf773 100644
--- a/routes/upload_file_router.py
+++ b/api/routers/upload_file_router.py
@@ -1,13 +1,15 @@
-from fastapi import APIRouter, File, Form, UploadFile
+from fastapi import APIRouter, File, Form, UploadFile, Depends
 from pydantic import BaseModel
 from typing import List, Optional
 from services.upload_file.upload import handle_upload_file, handle_process_pdf, handle_to_postgis
+from api.deps.role_dependency import require_role
+from database.connection import engine
 
 router = APIRouter()
 
-
 @router.post("/file")
+# async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form(""), sheet: Optional[str] = Form(""), user = Depends(require_role("admin"))):
 async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form(""), sheet: Optional[str] = Form("")):
     return await handle_upload_file(file, page, sheet)
 
@@ -30,5 +32,5 @@ class UploadRequest(BaseModel):
     columns: List[str]
 
 @router.post("/to-postgis")
-def upload_to_postgis(payload: UploadRequest):
-    return handle_to_postgis(payload)
\ No newline at end of file
+async def upload_to_postgis(payload: UploadRequest):
+    return await handle_to_postgis(payload, engine)
\ No newline at end of file
diff --git a/database/connection.py b/database/connection.py
index 6903193..921a694 100644
--- a/database/connection.py
+++ b/database/connection.py
@@ -1,6 +1,9 @@
-from sqlalchemy import create_engine
+from sqlalchemy.ext.asyncio import create_async_engine
 from sqlalchemy.orm import sessionmaker
+from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
 from core.config import POSTGIS_URL
 
-engine = create_engine(POSTGIS_URL, pool_pre_ping=True)
-SessionLocal = sessionmaker(bind=engine)
+engine = create_async_engine(POSTGIS_URL, pool_pre_ping=True)
+# SessionLocal = sessionmaker(bind=engine)
+SessionLocal = async_sessionmaker(engine, expire_on_commit=False)
diff --git a/database/models.py b/database/models.py
index f848160..fa7c787 100644
--- a/database/models.py
+++ b/database/models.py
@@ -1,4 +1,5 @@
-from sqlalchemy import Column, Integer, String, Text, TIMESTAMP
+from sqlalchemy import Column, Integer, String, Text, ForeignKey, DateTime, TIMESTAMP
+from sqlalchemy.orm import relationship
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.sql import func
 
@@ -14,3 +15,28 @@ class UploadLog(Base):
     uploaded_at = Column(TIMESTAMP, server_default=func.now())
     status = Column(String)
     message = Column(Text)
+
+
+class Institution(Base):
+    __tablename__ = "institutions"
+
+    id = Column(Integer, primary_key=True, index=True)
+    name = Column(String(100), unique=True, nullable=False)
+    address = Column(String(200), nullable=True)
+
+    users = relationship("User", back_populates="institution")
+
+
+class User(Base):
+    __tablename__ = "users"
+
+    id = Column(Integer, primary_key=True, index=True)
+    username = Column(String(50), unique=True, nullable=False)
+    password_hash = Column(String(255), nullable=False)
+    role = Column(String(50), nullable=False, default="user")  # <── Added role
+    institution_id = Column(Integer, ForeignKey("institutions.id"), nullable=True)
+    active_token = Column(String(255), nullable=True)
+    token_expired_at = Column(DateTime, nullable=True)
+    last_login = Column(DateTime, nullable=True)
+
+    institution = relationship("Institution", back_populates="users")
\ No newline at end of file
diff --git a/database/uploader.py b/database/uploader.py
deleted file mode 100644
index da164ac..0000000
--- a/database/uploader.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import geopandas as gpd
-import pandas as pd
-from database.connection import engine
-from sqlalchemy import text
-
-def save_dataframe_dynamic(df: pd.DataFrame, table_name: str):
-    """Save pandas DataFrame to Postgres (non-geo)."""
-    df.to_sql(table_name, engine, if_exists="replace", index=False, method='multi', chunksize=1000)
-
-def save_geodataframe(gdf: gpd.GeoDataFrame, table_name: str):
-    """Save GeoDataFrame to PostGIS (requires geoalchemy/geopandas)."""
-    # ensure geometry column exists and CRS set
-    if gdf.crs is None:
-        gdf = gdf.set_crs("EPSG:4326", allow_override=True)
-    # geopandas >= 0.10 has to_postgis in some installs; fallback using SQLAlchemy + GeoAlchemy2:
-    gdf.to_postgis(table_name, engine, if_exists="replace")
diff --git a/init_db.py b/init_db.py
deleted file mode 100644
index 0702c1c..0000000
--- a/init_db.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from database.connection import engine
-from database.models import Base
-Base.metadata.create_all(bind=engine)
diff --git a/main.py b/main.py
index 799ee8b..50a55e9 100644
--- a/main.py
+++ b/main.py
@@ -3,9 +3,11 @@ from fastapi.middleware.cors import CORSMiddleware
 from core.config import API_VERSION, ALLOWED_ORIGINS
 from database.connection import engine
 from database.models import Base
-from routes.router import router as system_router
-from routes.upload_file_router import router as upload_router
-from routes.auth_router import router as auth_router
+from api.routers.system_router import router as system_router
+from api.routers.upload_file_router import router as upload_router
+from api.routers.auth_router import router as auth_router
+from contextlib import asynccontextmanager
+from utils.qgis_init import init_qgis
 
 app = FastAPI(
     title="ETL Geo Upload Service",
@@ -21,9 +23,39 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-Base.metadata.create_all(bind=engine)
+# Base.metadata.create_all(bind=engine)
+
+# QGIS setup
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    global qgs
+    qgs = init_qgis()
+    print("QGIS initialized")
+
+    yield
+
+    # SHUTDOWN (optional)
+    print("Shutting down...")
+
+# Attach the lifespan to the existing app; re-creating it with
+# FastAPI(lifespan=lifespan) would discard the title and CORS middleware above.
+app.router.lifespan_context = lifespan
+
+@app.get("/qgis/status")
+def qgis_status():
+    try:
+        from qgis.core import Qgis
+        return {
+            "qgis_status": "connected",
+            "qgis_version": Qgis.QGIS_VERSION
+        }
+    except Exception as e:
+        return {
+            "qgis_status": "error",
+            "error": str(e)
+        }
 
 # Register routers
 app.include_router(system_router, tags=["System"])
 app.include_router(auth_router, prefix="/auth", tags=["Auth"])
-app.include_router(upload_router, prefix="/upload", tags=["Upload"])
+app.include_router(upload_router, prefix="/upload", tags=["Upload"])
\ No newline at end of file
diff --git a/response.py b/response.py
new file mode 100644
index 0000000..3804dc3
--- /dev/null
+++ b/response.py
@@ -0,0 +1,22 @@
+from fastapi import HTTPException
+from fastapi.responses import JSONResponse
+
+def successRes(data=None, message="Success", status_code=200):
+    return JSONResponse(
+        status_code=status_code,
+        content={
+            "status": "success",
+            "message": message,
+            "data": data,
+        }
+    )
+
+def errorRes(message="Error", status_code=400, details=None):
+    return HTTPException(
+        status_code=status_code,
+        detail={
+            "status": "error",
+            "message": message,
+            "details": details
+        }
+    )
diff --git a/routes/auth_router.py b/routes/auth_router.py
deleted file mode 100644
index 7896e26..0000000
--- a/routes/auth_router.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from fastapi import APIRouter
-from core.config import API_VERSION
-
-router = APIRouter()
-
-@router.get("/login")
-async def login():
-    return {"status": "success"}
-
-
-
-
-
-
-
-
diff --git a/services/.DS_Store b/services/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..9a030516c40377590eff6ebbb90728008ea5fc3b
Binary files /dev/null and b/services/.DS_Store differ
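
The two helpers in the new response.py are intentionally asymmetric: successRes builds a JSONResponse that a route returns, while errorRes builds an HTTPException that a route must raise (as datasets_router.py does). A short usage sketch with a hypothetical route:

    from fastapi import APIRouter
    from response import successRes, errorRes

    router = APIRouter()

    @router.get("/health")
    async def health(ok: bool = True):
        if not ok:
            # errorRes returns an HTTPException instance; raise it, don't return it
            raise errorRes(message="Service unavailable", status_code=503)
        # successRes returns a JSONResponse; return it directly
        return successRes(data={"status": "up"})
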
zvJ}mZ|HuICT^%Mc#2r)cegC3hnhk?Q1sL%j43jt;x7#10RIY4nR-LL-cW%8$HTA~+ zc#`$}(KYohluCla_JhkfnhqMm2>zM(k=F|p_ z=4^J@J#5KN_h8Z1c2x<1l)g^&bo zdP@*Wi>}4oAda92lZt3kg?(ZOla79A<6MimL6Z(b&y3%(GYk7d5qfs?OC1ivHOMV9 zzzi%hP&VBL)&H~A@BhUl?lA+*z*;dNDt)it!zJ0;y0kc|wG#CXm4xDQgI_6V=u(Wa eREnFZTF@`aKy)qU2GN7U7XeKJH_X7FGVlri&Q4H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 best_score: -# best_score = score -# header_line_idx = i - -# return header_line_idx - - -# def detect_delimiter(path, sample_size=2048): -# with open(path, 'r', encoding='utf-8', errors='ignore') as f: -# sample = f.read(sample_size) -# sniffer = csv.Sniffer() -# try: -# dialect = sniffer.sniff(sample) -# return dialect.delimiter -# except Exception: -# for delim in [',', ';', '\t', '|']: -# if delim in sample: -# return delim -# return ',' - - -# def read_csv(path: str): -# ext = os.path.splitext(path)[1].lower() # ambil ekstensi file - -# try: -# if ext in ['.csv', '.txt']: -# # === Baca file CSV === -# header_line = detect_header_line(path) -# delimiter = detect_delimiter(path) -# print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'") - -# df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',') - -# elif ext in ['.xlsx', '.xls']: -# # === Baca file Excel === -# print(f"[INFO] Membaca file Excel: {os.path.basename(path)}") -# pre_df = pd.read_excel(path, header=0, dtype=str) # baca semua sebagai string -# df = pre_df.copy() -# for col in df.columns: -# if df[col].str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any(): -# df[col] = df[col].str.replace(',', '', regex=False) -# df[col] = pd.to_numeric(df[col], errors='ignore') - -# else: -# raise ValueError("Format file tidak dikenali (hanya .csv, .txt, .xlsx, .xls)") - -# except Exception as e: -# print(f"[WARN] Gagal membaca file ({e}), fallback ke default") -# df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',') - -# # Bersihkan kolom dan baris kosong -# df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')] -# df.columns = [str(c).strip() for c in df.columns] -# df = df.dropna(how='all') - -# return df - - - - - - - - - - - - -import pandas as pd -import re -import csv -import os - -def detect_header_line(path, max_rows=10): - with open(path, 'r', encoding='utf-8', errors='ignore') as f: - lines = [next(f) for _ in range(max_rows)] - header_line_idx = 0 - best_score = -1 - for i, line in enumerate(lines): - cells = re.split(r'[;,|\t]', line.strip()) - alpha_ratio = sum(bool(re.search(r'[A-Za-z]', c)) for c in cells) / max(len(cells), 1) - digit_ratio = sum(bool(re.search(r'\d', c)) for c in cells) / max(len(cells), 1) - score = alpha_ratio - digit_ratio - if score > best_score: - best_score = score - header_line_idx = i - return header_line_idx - -def detect_delimiter(path, sample_size=2048): - with open(path, 'r', encoding='utf-8', errors='ignore') as f: - sample = f.read(sample_size) - sniffer = csv.Sniffer() - try: - dialect = sniffer.sniff(sample) - return dialect.delimiter - except Exception: - for delim in [',', ';', '\t', '|']: - if delim in sample: - return delim - return ',' - -# def read_csv(path: str): -# ext = os.path.splitext(path)[1].lower() - -# try: -# if ext in ['.csv']: -# # === Baca file CSV === -# header_line = detect_header_line(path) -# delimiter = detect_delimiter(path) -# print(f"[INFO] Detected header line: {header_line + 1}, delimiter: 
'{delimiter}'") - -# df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',') - -# elif ext in ['.xlsx', '.xls']: -# # === Baca file Excel === -# print(f"[INFO] Membaca file Excel: {os.path.basename(path)}") -# xls = pd.ExcelFile(path) - -# print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}") - -# # Evaluasi tiap sheet untuk mencari yang paling relevan -# best_sheet = None -# best_score = -1 -# best_df = None - -# for sheet_name in xls.sheet_names: -# try: -# df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str) -# df = df.dropna(how='all').dropna(axis=1, how='all') - -# if len(df) == 0 or len(df.columns) < 2: -# continue - -# # hitung "skor relevansi" -# text_ratio = df.applymap(lambda x: isinstance(x, str)).sum().sum() / (df.size or 1) -# row_score = len(df) -# score = (row_score * 0.7) + (text_ratio * 100) - -# if score > best_score: -# best_score = score -# best_sheet = sheet_name -# best_df = df - -# except Exception as e: -# print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}") -# continue - -# if best_df is not None: -# print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}") -# df = best_df -# else: -# raise ValueError("Tidak ada sheet valid yang dapat dibaca.") - -# # Konversi tipe numerik jika ada -# for col in df.columns: -# if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any(): -# df[col] = df[col].astype(str).str.replace(',', '', regex=False) -# df[col] = pd.to_numeric(df[col], errors='ignore') - -# else: -# raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)") - -# except Exception as e: -# print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.") -# df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',') - -# # Bersihkan kolom dan baris kosong -# df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')] -# df.columns = [str(c).strip() for c in df.columns] -# df = df.dropna(how='all') - -# return df - - - - - -def read_csv(path: str, sheet: str = None): - ext = os.path.splitext(path)[1].lower() - - try: - if ext in ['.csv']: - # === Baca file CSV === - header_line = detect_header_line(path) - delimiter = detect_delimiter(path) - print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'") - - df = pd.read_csv( - path, - header=header_line, - sep=delimiter, - encoding='utf-8', - low_memory=False, - thousands=',' - ) - - elif ext in ['.xlsx', '.xls']: - # === Baca file Excel === - print(f"[INFO] Membaca file Excel: {os.path.basename(path)}") - xls = pd.ExcelFile(path) - print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}") - - # === Jika user memberikan nama sheet === - if sheet: - if sheet not in xls.sheet_names: - raise ValueError(f"Sheet '{sheet}' tidak ditemukan dalam file {os.path.basename(path)}") - print(f"[INFO] Membaca sheet yang ditentukan: '{sheet}'") - df = pd.read_excel(xls, sheet_name=sheet, header=0, dtype=str) - df = df.dropna(how='all').dropna(axis=1, how='all') - - else: - # === Auto-detect sheet terbaik === - print("[INFO] Tidak ada sheet yang ditentukan, mencari sheet paling relevan...") - best_sheet = None - best_score = -1 - best_df = None - - for sheet_name in xls.sheet_names: - try: - temp_df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str) - temp_df = temp_df.dropna(how='all').dropna(axis=1, how='all') - - if len(temp_df) == 0 or len(temp_df.columns) < 2: - continue - - # hitung skor relevansi 
- text_ratio = temp_df.applymap(lambda x: isinstance(x, str)).sum().sum() / (temp_df.size or 1) - row_score = len(temp_df) - score = (row_score * 0.7) + (text_ratio * 100) - - if score > best_score: - best_score = score - best_sheet = sheet_name - best_df = temp_df - - except Exception as e: - print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}") - continue - - if best_df is not None: - print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}") - df = best_df - else: - raise ValueError("Tidak ada sheet valid yang dapat dibaca.") - - # Konversi tipe numerik jika ada - for col in df.columns: - if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any(): - df[col] = df[col].astype(str).str.replace(',', '', regex=False) - df[col] = pd.to_numeric(df[col], errors='ignore') - - else: - raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)") - - except Exception as e: - print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.") - df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',') - - # Bersihkan kolom dan baris kosong - df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')] - df.columns = [str(c).strip() for c in df.columns] - df = df.dropna(how='all') - - return df - diff --git a/services/upload_file/read_pdf/filter_column.py b/services/upload_file/read_pdf/filter_column.py deleted file mode 100644 index b814503..0000000 --- a/services/upload_file/read_pdf/filter_column.py +++ /dev/null @@ -1,47 +0,0 @@ -import re -import itertools - -geo_admin_keywords = [ - 'lat', 'lon', 'long', 'latitude', 'longitude', 'koordinat', 'geometry', 'geometri', - 'desa', 'kelurahan', 'kel', 'kecamatan', 'kabupaten', 'kab', 'kota', 'provinsi', - 'lokasi', 'region', 'area', 'zone', 'boundary', 'batas' -] - -def normalize_text(text): - text = text.lower() - text = re.sub(r'[^a-z0-9/ ]+', ' ', text) - text = re.sub(r'\s+', ' ', text).strip() - return text - -def generate_combined_patterns(keywords): - combos = list(itertools.combinations(keywords, 2)) - patterns = [] - for a, b in combos: - patterns.append(rf'{a}\s*/\s*{b}') - patterns.append(rf'{b}\s*/\s*{a}') - return patterns - -combined_patterns = generate_combined_patterns(geo_admin_keywords) - -def contains_geo_admin_keywords(text): - text_clean = normalize_text(text) - if len(text_clean) < 3: - return False - - for pattern in combined_patterns: - if re.search(pattern, text_clean): - return True - - for kw in geo_admin_keywords: - if re.search(rf'(^|[\s/_-]){kw}([\s/_-]|$)', text_clean): - return True - - return False - -def filter_geo_admin_column(tables): - filtered = [] - for table in tables: - found = any(contains_geo_admin_keywords(col) for col in table['columns']) - if found: - filtered.append(table) - return filtered diff --git a/services/upload_file/read_pdf/reader_pdf.py b/services/upload_file/read_pdf/reader_pdf.py deleted file mode 100644 index 0f31264..0000000 --- a/services/upload_file/read_pdf/reader_pdf.py +++ /dev/null @@ -1,270 +0,0 @@ -import pdfplumber -import re -import pandas as pd -from services.upload_file.read_pdf.filter_column import filter_geo_admin_column - -def is_number(s): - if s is None: - return False - s = str(s).strip().replace(',', '').replace('.', '') - return s.isdigit() - -def row_ratio(row): - non_empty = [c for c in row if c not in (None, '', ' ')] - if not non_empty: - return 0 - num_count = sum(is_number(c) for c in non_empty) - return num_count / len(non_empty) - -def has_mixed_text_and_numbers(row): - 
non_empty = [c for c in row if c not in (None, '', ' ')] - has_text = any(isinstance(c, str) and re.search(r'[A-Za-z]', str(c)) for c in non_empty) - has_num = any(is_number(c) for c in non_empty) - return has_text and has_num - -def is_short_text_row(row): - """Deteksi baris teks pendek (1-2 kolom teks pendek).""" - non_empty = [str(c).strip() for c in row if c not in (None, '', ' ')] - if not non_empty: - return False - text_only = all(not is_number(c) for c in non_empty) - joined = " ".join(non_empty) - return text_only and len(non_empty) <= 2 and len(joined) < 20 - -def detect_header_rows(rows): - if not rows: - return [] - - ratios = [row_ratio(r) for r in rows] - body_start_index = None - - for i in range(1, len(rows)): - row = rows[i] - if has_mixed_text_and_numbers(row): - body_start_index = i - break - if ratios[i] > 0.3: - body_start_index = i - break - if any(isinstance(c, str) and re.match(r'^\d+$', c.strip()) for c in row): - body_start_index = i - break - if ratios[i - 1] == 0 and ratios[i] > 0: - body_start_index = i - break - - if body_start_index is None: - body_start_index = len(rows) - - potential_headers = rows[:body_start_index] - body_filtered = rows[body_start_index:] - header_filtered = [] - for idx, row in enumerate(potential_headers): - if is_short_text_row(row): - if idx + 1 < len(potential_headers) and ratios[idx + 1] == 0: - header_filtered.append(row) - else: - continue - else: - header_filtered.append(row) - - return header_filtered, body_filtered - - -def merge_multiline_header(header_rows): - final_header = [] - for col in zip(*header_rows): - val = next((v for v in reversed(col) if v and str(v).strip()), '') - val = str(val).replace('\n', ' ').strip() - final_header.append(val) - final_header = [v for v in final_header if v not in ['', None]] - return final_header - - - -NUMBER_HEADER_KEYWORDS = ["no","no.","no .","no . ","no :","no : ","nomor","nomor.","nomor :","nomor urut","no urut","no. urut","no-urut","no_urut","nomor_urut","nomor-urut","No","NO","NO.","No.","No :","NO :","Nomor","NOMOR","Nomor Urut","NOMOR URUT","No Urut","NO URUT","No. Urut","NO. URUT","No /","No / ","No / Nama","No -","No - ","Nomor /","Nomor -","Number","No. of","No of","Index","Serial","Order","ID","ID No","ID No.","Sr No","Sr. No","S/N","SN","Sl No","Sl. 
No","N0","N0.","N0 :","NOM0R","NOM0R URUT","N0MOR",] - -def has_number_header(header): - """Periksa apakah header mengandung kolom No/Nomor.""" - header_text = header - return any(keyword in header_text for keyword in NUMBER_HEADER_KEYWORDS) - -def is_numbering_column(col_values): - """Periksa apakah kolom pertama diisi nomor urut seperti 1, 01, 2, dst.""" - numeric_like = 0 - total = 0 - for v in col_values: - if not v or not isinstance(v, str): - continue - total += 1 - if re.fullmatch(r"0*\d{1,3}", v.strip()): - numeric_like += 1 - return total > 0 and (numeric_like / total) > 0.6 - -def is_numeric_value(v): - """Cek apakah suatu nilai termasuk angka (int, float, atau string angka).""" - if v is None: - return False - if isinstance(v, (int, float)): - return True - if isinstance(v, str) and re.fullmatch(r"0*\d{1,3}", v.strip()): - return True - return False - -def cleaning_column(headers, bodies): - cleaned_bodies = [] - - for header, body in zip(headers, bodies): - if not body: - cleaned_bodies.append(body) - continue - - header_has_number = has_number_header(header) - first_col = [row[0] for row in body if row and len(row) > 0] - first_col_is_numbering = is_numbering_column(first_col) - - if not header_has_number and first_col_is_numbering: - new_body = [] - for row in body: - if not row: - continue - first_val = row[0] - if is_numeric_value(first_val) and len(row) > 1: - new_body.append(row[1:]) - else: - new_body.append(row) - body = new_body - - header_len = len(headers) - filtered_body = [row for row in body if len(row) == header_len] - - cleaned_bodies.append(filtered_body) - - return cleaned_bodies - - - - -def parse_page_selection(selectedPage: str, total_pages: int): - if not selectedPage: - return list(range(1, total_pages + 1)) - - pages = set() - parts = re.split(r'[,\s]+', selectedPage.strip()) - - for part in parts: - if '-' in part: - try: - start, end = map(int, part.split('-')) - pages.update(range(start, end + 1)) - except ValueError: - continue - else: - try: - pages.add(int(part)) - except ValueError: - continue - - valid_pages = [p for p in sorted(pages) if 1 <= p <= total_pages] - return valid_pages - - - -def read_pdf(path: str, page: str): - pdf_path = path - selectedPage = None - # if page == '' or None: - # selectedPage = "1" - if not page: - selectedPage = "1" - else: - selectedPage = page - tables_data = [] - with pdfplumber.open(pdf_path) as pdf: - total_pages = len(pdf.pages) - selected_pages = parse_page_selection(selectedPage, total_pages) - - print(f"[INFO] Total halaman PDF: {total_pages}") - print(f"[INFO] Halaman yang dipilih untuk dibaca: {selected_pages}") - - for page_num in selected_pages: - pdf_page = pdf.pages[page_num - 1] # index pdfplumber mulai dari 0 - tables = pdf_page.find_tables() - print(f"[INFO] Halaman {page_num}: {len(tables)} tabel terdeteksi") - - for t in tables: - table = t.extract() - if len(table) > 2: - tables_data.append(table) - - print(f"\nTotal tabel valid: {len(tables_data)}\n") - - header_only = [] - body_only = [] - for tbl in tables_data: - head, body = detect_header_rows(tbl) - header_only.append(head) - body_only.append(body) - - clean_header = [] - for h in header_only: - clean_header.append(merge_multiline_header(h)) - - clean_body=[] - for i, raw_body in enumerate(body_only): - con_body = [[cell for cell in row if cell not in (None, '')] for row in raw_body] - cleaned = cleaning_column(clean_header[i], [con_body]) - clean_body.append(cleaned[0]) - - parsed = [] - for i, (cols, rows) in enumerate(zip(clean_header, 
clean_body), start=1): - parsed.append({ - "title": str(i), - "columns": cols, - "rows": rows - }) - - clean_parsed = filter_geo_admin_column(parsed) - # print(f"parsed{clean_parsed}") - return clean_parsed - - - - - - -def convert_df(payload): - if "columns" not in payload or "rows" not in payload: - raise ValueError("Payload tidak memiliki key 'columns' atau 'rows'.") - - if not isinstance(payload["columns"], list): - raise TypeError("'columns' harus berupa list.") - if not isinstance(payload["rows"], list): - raise TypeError("'rows' harus berupa list.") - - for i, row in enumerate(payload["rows"]): - if len(row) != len(payload["columns"]): - raise ValueError(f"Jumlah elemen di baris ke-{i} tidak sesuai jumlah kolom.") - - df = pd.DataFrame(payload["rows"], columns=payload["columns"]) - - if "title" in payload: - df.attrs["title"] = payload["title"] - - return df - - - - - - - - -def test_read_pdf(): - # single - # parsed = [{'title': 'Tabel 3.49. Potensi Penduduk Terpapar Bencana Banjir di Provinsi Jawa Timur', 'columns': ['No', 'Kabupaten/Kota', 'Jumlah Penduduk Terpapar (Jiwa)', 'Penduduk Umur Rentan', 'Penduduk Miskin', 'Penduduk Disabilitas', 'Kelas'], 'rows': [['1', 'PACITAN', '111.309', '14.142', '9.307', '781', 'SEDANG'], ['2', 'PONOROGO', '381.579', '50.815', '44.256', '2.346', 'SEDANG'], ['3', 'TRENGGALEK', '284.509', '34.304', '33.653', '1.945', 'SEDANG'], ['4', 'TULUNGAGUNG', '777.174', '86.452', '67.952', '3.200', 'SEDANG'], ['5', 'BLITAR', '226.767', '25.032', '22.554', '909', 'SEDANG'], ['6', 'KEDIRI', '545.961', '59.272', '74.578', '2.539', 'SEDANG'], ['7', 'MALANG', '238.170', '23.646', '25.388', '641', 'SEDANG'], ['8', 'LUMAJANG', '267.926', '30.206', '33.738', '970', 'SEDANG'], ['9', 'JEMBER', '1.061.703', '109.355', '105.958', '2.424', 'SEDANG'], ['10', 'BANYUWANGI', '442.290', '51.294', '44.107', '1.168', 'SEDANG'], ['11', 'BONDOWOSO', '143.452', '18.178', '21.676', '517', 'SEDANG'], ['12', 'SITUBONDO', '233.211', '26.799', '54.221', '928', 'SEDANG'], ['13', 'PROBOLINGGO', '326.005', '37.002', '58.562', '1.323', 'SEDANG'], ['14', 'PASURUAN', '485.143', '49.285', '65.076', '1.576', 'SEDANG'], ['15', 'SIDOARJO', '1.930.615', '172.191', '132.673', '3.987', 'SEDANG'], ['16', 'MOJOKERTO', '498.583', '52.453', '49.831', '1.491', 'SEDANG'], ['17', 'JOMBANG', '876.937', '92.415', '107.447', '4.985', 'SEDANG'], ['18', 'NGANJUK', '829.022', '95.454', '117.127', '3.029', 'SEDANG'], ['19', 'MADIUN', '363.763', '44.997', '44.877', '1.695', 'SEDANG'], ['20', 'MAGETAN', '117.247', '15.706', '11.051', '652', 'SEDANG'], ['21', 'NGAWI', '419.065', '49.864', '65.877', '1.572', 'SEDANG'], ['22', 'BOJONEGORO', '910.377', '100.800', '117.977', '3.557', 'SEDANG'], ['23', 'TUBAN', '507.407', '51.775', '60.834', '2.206', 'SEDANG'], ['24', 'LAMONGAN', '884.503', '99.928', '96.031', '3.960', 'SEDANG'], ['25', 'GRESIK', '613.133', '59.848', '49.854', '1.666', 'SEDANG'], ['26', 'BANGKALAN', '312.149', '31.075', '36.099', '1.169', 'SEDANG'], ['27', 'SAMPANG', '239.656', '28.756', '39.790', '1.280', 'SEDANG'], ['28', 'PAMEKASAN', '216.423', '25.831', '30.296', '776', 'SEDANG'], ['29', 'SUMENEP', '217.805', '24.741', '33.293', '1.088', 'SEDANG'], ['1', 'KOTA KEDIRI', '162.064', '17.129', '13.997', '363', 'SEDANG'], ['2', 'KOTA BLITAR', '21.390', '2.242', '1.185', '79', 'SEDANG'], ['3', 'KOTA MALANG', '148.072', '15.499', '6.142', '201', 'SEDANG'], ['4', 'KOTA PROBOLINGGO', '117.911', '12.708', '10.913', '420', 'SEDANG'], ['5', 'KOTA PASURUAN', '199.602', '20.199', '19.721', '516', 'SEDANG'], ['6', 
'KOTA MOJOKERTO', '139.962', '14.486', '6.971', '584', 'SEDANG'], ['7', 'KOTA MADIUN', '149.468', '17.255', '6.300', '304', 'SEDANG'], ['8', 'KOTA SURABAYA', '2.469.639', '244.061', '133.953', '3.838', 'SEDANG'], ['9', 'KOTA BATU', '8.858', '939', '529', '13', 'SEDANG'], ['-', 'Provinsi Jawa Timur', '17.878.850', '1.906.134', '1.853.794', '60.698', 'SEDANG']]}]
-
-    # double
-    parsed = [{"title":"Luas Catchment Area (km2) Pada Wilayah Sungai di Provinsi Jawa Timur","columns":["Wilayah Sungai","Luas (km2)","Jumlah DAS"],"rows":[["Bengawan Solo","13.070,00","94 DAS"],["Brantas","13.880,00","20 DAS"],["Welang -Rejoso","2.601,00","36 DAS"],["Pekalen -Sampean","3.953,00","56 DAS"],["Baru -Bajulmati","3.675,00","60 DAS"],["Bondoyudo -Bedadung","5.364,00","47 DAS"],["Madura","4.575,00","173 DAS"]]},{"title":"Jumlah dan Kepadatan Penduduk Menurut Kabupaten\/kota di Provinsi Jawa Timur Tahun 2021","columns":["Kabupaten\/Kota","Jumlah Penduduk","Persentase","Kepadatan Penduduk (Jiwa per Km2)"],"rows":[["Bangkalan","1.082.759","2,64","1.081,20"],["Banyuwangi","1.749.773","4,27","302,60"],["Blitar","1.228.292","3,00","919,05"],["Bojonegoro","1.343.895","3,28","611,20"],["Bondowoso","801.541","1,96","525,27"],["Gresik","1.283.961","3,13","1.077,83"],["Jember","2.581.486","6,30","834,80"],["Jombang","1.350.483","3,29","1.211,10"],["Kediri","1.671.821","4,08","1.206,18"],["Lamongan","1.379.731","3,37","774,24"],["Lumajang","1.091.856","2,66","609,67"],["Madiun","754.263","1,84","726,94"],["Magetan","689.369","1,68","1.000,77"],["Malang","2.611.907","6,37","739,78"],["Mojokerto","1.126.540","2,75","1.569,37"],["Nganjuk","1.133.556","2,77","925,92"],["Ngawi","896.768","2,19","691,96"],["Pacitan","597.580","1,46","429,94"],["Pamekasan","840.790","2,05","1.061,28"],["Pasuruan","1.603.754","3,91","1.088,01"],["Ponorogo","968.681","2,36","741,89"],["Probolinggo","1.156.570","2,82","681,86"],["Sampang","902.514","2,20","731,92"],["Sidoarjo","1.951.723","4,76","3.076,58"],["Situbondo","666.245","1,63","398,98"],["Sumenep","1.134.750","2,77","567,79"],["Trenggalek","746.734","1,82","650,91"],["Tuban","1.223.257","2,98","666,93"],["Tulungagung","1.126.679","2,75","1.067,28"],["Kota Batu","215.248","0,53","1.574,14"],["Kota Blitar","158.123","0,39","4.854,87"],["Kota Kediri","292.363","0,71","4.611,40"],["Kota Madiun","201.243","0,49","6.045,15"],["Kota Malang","866.356","2,11","5.963,35"],["Kota Mojokerto","139.961","0,34","8.497,94"],["Kota Pasuruan","210.341","0,51","5.960,36"],["Kota Probolinggo","242.246","0,59","4.274,68"],["Kota Surabaya","2.970.843","7,25","8.475,05"],["Provinsi Jawa Timur","40.994.002","100,00","76.228,17"]]}]
-    # df = convert_df(parsed, table_index=0)
-    return parsed
\ No newline at end of file
diff --git a/services/upload_file/readers/__pycache__/reader_csv.cpython-39.pyc b/services/upload_file/readers/__pycache__/reader_csv.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..28912198b0358e0ed202448cb77150f1b39c1ea9
Binary files /dev/null and b/services/upload_file/readers/__pycache__/reader_csv.cpython-39.pyc differ
diff --git a/services/upload_file/readers/__pycache__/reader_gdb.cpython-39.pyc b/services/upload_file/readers/__pycache__/reader_gdb.cpython-39.pyc
new file mode 100644
Binary files /dev/null and b/services/upload_file/readers/__pycache__/reader_gdb.cpython-39.pyc differ
diff --git a/services/upload_file/readers/__pycache__/reader_mpk.cpython-39.pyc b/services/upload_file/readers/__pycache__/reader_mpk.cpython-39.pyc
new file mode 100644
Binary files /dev/null and b/services/upload_file/readers/__pycache__/reader_mpk.cpython-39.pyc differ
diff --git a/services/upload_file/readers/__pycache__/reader_pdf.cpython-39.pyc b/services/upload_file/readers/__pycache__/reader_pdf.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4c165af49ea8bf308659c7d70507dd4c84b29cbc
Binary files /dev/null and b/services/upload_file/readers/__pycache__/reader_pdf.cpython-39.pyc differ
diff --git a/services/upload_file/readers/__pycache__/reader_shp.cpython-39.pyc b/services/upload_file/readers/__pycache__/reader_shp.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0d13461c0d4f52354f381db4f87710e2a56a1119
Binary files /dev/null and b/services/upload_file/readers/__pycache__/reader_shp.cpython-39.pyc differ
diff --git a/services/upload_file/readers/reader_csv.py b/services/upload_file/readers/reader_csv.py
new file mode 100644
--- /dev/null
+++ b/services/upload_file/readers/reader_csv.py
@@ -0,0 +1,116 @@
+import pandas as pd
+import re
+import csv
+import os
+
+def detect_header_line(path, max_rows=10):
+    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
+        lines = [next(f) for _ in range(max_rows)]
+    header_line_idx = 0
+    best_score = -1
+    for i, line in enumerate(lines):
+        cells = re.split(r'[;,|\t]', line.strip())
+        alpha_ratio = sum(bool(re.search(r'[A-Za-z]', c)) for c in cells) / max(len(cells), 1)
+        digit_ratio = sum(bool(re.search(r'\d', c)) for c in cells) / max(len(cells), 1)
+        score = alpha_ratio - digit_ratio
+        if score > best_score:
+            best_score = score
+            header_line_idx = i
+    return header_line_idx
+
+def detect_delimiter(path, sample_size=2048):
+    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
+        sample = f.read(sample_size)
+    sniffer = csv.Sniffer()
+    try:
+        dialect = sniffer.sniff(sample)
+        return dialect.delimiter
+    except Exception:
+        for delim in [',', ';', '\t', '|']:
+            if delim in sample:
+                return delim
+    return ','
+
+
+def read_csv(path: str, sheet: str = None):
+    ext = os.path.splitext(path)[1].lower()
+
+    try:
+        if ext in ['.csv']:
+            header_line = detect_header_line(path)
+            delimiter = detect_delimiter(path)
+            print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
+
+            df = pd.read_csv(
+                path,
+                header=header_line,
+                sep=delimiter,
+                encoding='utf-8',
+                low_memory=False,
+                thousands=','
+            )
+
+        elif ext in ['.xlsx', '.xls']:
+            print(f"[INFO] Reading Excel file: {os.path.basename(path)}")
+            xls = pd.ExcelFile(path)
+            print(f"[INFO] Found {len(xls.sheet_names)} sheet(s): {xls.sheet_names}")
+
+            if sheet:
+                if sheet not in xls.sheet_names:
+                    raise ValueError(f"Sheet '{sheet}' not found in file {os.path.basename(path)}")
+                print(f"[INFO] Reading the specified sheet: '{sheet}'")
+                df = pd.read_excel(xls, sheet_name=sheet, header=0, dtype=str)
+                df = df.dropna(how='all').dropna(axis=1, how='all')
+
+            else:
+                print("[INFO] No sheet specified, looking for the most relevant sheet...")
+                best_sheet = None
+                best_score = -1
+                best_df = None
+
+                for sheet_name in xls.sheet_names:
+                    try:
+                        temp_df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
+                        temp_df = temp_df.dropna(how='all').dropna(axis=1, how='all')
+
+                        if len(temp_df) == 0 or len(temp_df.columns) < 2:
+                            continue
+
+                        # compute a relevance score
+                        text_ratio = temp_df.applymap(lambda x: isinstance(x, str)).sum().sum() / (temp_df.size or 1)
+                        row_score = len(temp_df)
+                        score = (row_score * 0.7) + (text_ratio * 100)
+
+                        if score > best_score:
+                            best_score = score
+                            best_sheet = sheet_name
+                            best_df = temp_df
+
+                    except Exception as e:
+                        print(f"[WARN] Failed to read sheet {sheet_name}: {e}")
+                        continue
+
+                if best_df is not None:
+                    print(f"[INFO] Selected sheet: '{best_sheet}' with score {best_score:.2f}")
+                    df = best_df
+                else:
+                    raise ValueError("No valid sheet could be read.")
+
+            for col in df.columns:
+                if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
+                    df[col] = df[col].astype(str).str.replace(',', '', regex=False)
+                    df[col] = pd.to_numeric(df[col], errors='ignore')
+
+        else:
+            raise ValueError("Unrecognized file format (only .csv, .xlsx, .xls)")
+
+    except Exception as e:
+        print(f"[WARN] Failed to read file ({e}), falling back to the default reader.")
+        df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
+
+    df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
+    df.columns = [str(c).strip() for c in df.columns]
+    df = df.dropna(how='all')
+
+    return df
+
diff --git a/services/upload_file/read_gdb/reader_gdb.py b/services/upload_file/readers/reader_gdb.py
similarity index 100%
rename from services/upload_file/read_gdb/reader_gdb.py
rename to services/upload_file/readers/reader_gdb.py
diff --git a/services/upload_file/read_mpk/reader_mpk.py b/services/upload_file/readers/reader_mpk.py
similarity index 100%
rename from services/upload_file/read_mpk/reader_mpk.py
rename to services/upload_file/readers/reader_mpk.py
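
Usage of the relocated CSV/Excel reader above, with hypothetical file paths and sheet name; the sheet argument is optional and triggers auto-detection when omitted:

    from services.upload_file.readers.reader_csv import read_csv

    df_csv = read_csv("uploads/table.csv")                   # delimiter and header row auto-detected
    df_xlsx = read_csv("uploads/data.xlsx")                  # most relevant sheet auto-selected
    df_named = read_csv("uploads/data.xlsx", sheet="Rekap")  # explicit sheet; raises if missing
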
diff --git a/services/upload_file/readers/reader_pdf.py b/services/upload_file/readers/reader_pdf.py
new file mode 100644
index 0000000..972927f
--- /dev/null
+++ b/services/upload_file/readers/reader_pdf.py
@@ -0,0 +1,168 @@
+import re
+import pdfplumber
+import pandas as pd
+from services.upload_file.utils.pdf_cleaner import row_ratio, has_mixed_text_and_numbers, is_short_text_row, parse_page_selection, filter_geo_admin_column, cleaning_column
+from services.upload_file.upload_exceptions import PDFReadError
+from utils.logger_config import setup_logger
+
+logger = setup_logger(__name__)
+
+def detect_header_rows(rows):
+    if not rows:
+        return [], []  # keep the (header, body) tuple shape so callers can unpack
+
+    ratios = [row_ratio(r) for r in rows]
+    body_start_index = None
+
+    for i in range(1, len(rows)):
+        row = rows[i]
+        if has_mixed_text_and_numbers(row):
+            body_start_index = i
+            break
+        if ratios[i] > 0.3:
+            body_start_index = i
+            break
+        if any(isinstance(c, str) and re.match(r'^\d+$', c.strip()) for c in row):
+            body_start_index = i
+            break
+        if ratios[i - 1] == 0 and ratios[i] > 0:
+            body_start_index = i
+            break
+
+    if body_start_index is None:
+        body_start_index = len(rows)
+
+    potential_headers = rows[:body_start_index]
+    body_filtered = rows[body_start_index:]
+    header_filtered = []
+    for idx, row in enumerate(potential_headers):
+        if is_short_text_row(row):
+            if idx + 1 < len(potential_headers) and ratios[idx + 1] == 0:
+                header_filtered.append(row)
+            else:
+                continue
+        else:
+            header_filtered.append(row)
+
+    return header_filtered, body_filtered
+
+
+def merge_multiline_header(header_rows):
+    final_header = []
+    for col in zip(*header_rows):
+        val = next((v for v in reversed(col) if v and str(v).strip()), '')
+        val = str(val).replace('\n', ' ').strip()
+        final_header.append(val)
+    final_header = [v for v in final_header if v not in ['', None]]
+    return final_header
+
+
+def read_pdf(path: str, page: str):
+    """
+    Semi-automatically reads tables from a PDF file using `pdfplumber`.
+
+    Main processing flow:
+    1. **Open the PDF file** with pdfplumber.
+    2. **Select pages** based on the `page` input (e.g. "1,3-5" for pages 1 and 3 through 5).
+    3. **Detect tables** on each selected page.
+    4. **Extract raw tables** (list of lists) from each page.
+    5. **Split header and body rows** with `detect_header_rows()`.
+    6. **Merge multi-line headers** (e.g. tables with two rows of column titles).
+    7. **Clean the table body** with `cleaning_column()`:
+       - Removes the running-number column.
+       - Aligns the number of columns with the header.
+    8. **Assemble the final result** as JSON with the structure:
+       {
+           "title": <table title>,
+           "columns": [...],
+           "rows": [...]
+       }
+    9. **Apply an extra filter** with `filter_geo_admin_column()` (geospatial metadata only).
+    10. **Return the result** as a list of JSON objects ready for the frontend API.
+
+    Args:
+        path (str): Location of the PDF file to read.
+        page (str): Page number or range, e.g. "1", "2-4", "1,3-5".
+
+    Returns:
+        list[dict]: Extracted tables with their column and row structure.
+
+    Raises:
+        PDFReadError: If reading or parsing the PDF fails.
+    """
+    try:
+        pdf_path = path
+        selectedPage = page if page else "1"
+        tables_data = []
+
+        with pdfplumber.open(pdf_path) as pdf:
+            total_pages = len(pdf.pages)
+            selected_pages = parse_page_selection(selectedPage, total_pages)
+
+            logger.info(f"[INFO] Total PDF pages: {total_pages}")
+            logger.info(f"[INFO] Pages selected for reading: {selected_pages}")
+
+            for page_num in selected_pages:
+                pdf_page = pdf.pages[page_num - 1]
+                tables = pdf_page.find_tables()
+                logger.info(f"[INFO] Page {page_num}: {len(tables)} table(s) detected")
+
+                for t in tables:
+                    table = t.extract()
+                    if len(table) > 2:
+                        tables_data.append(table)
+
+        logger.info(f"\nTotal valid tables: {len(tables_data)}\n")
+
+        header_only, body_only = [], []
+        for tbl in tables_data:
+            head, body = detect_header_rows(tbl)
+            header_only.append(head)
+            body_only.append(body)
+
+        clean_header = [merge_multiline_header(h) for h in header_only]
+        clean_body = []
+
+        for i, raw_body in enumerate(body_only):
+            con_body = [[cell for cell in row if cell not in (None, '')] for row in raw_body]
+            cleaned = cleaning_column(clean_header[i], [con_body])
+            clean_body.append(cleaned[0])
+
+        parsed = []
+        for i, (cols, rows) in enumerate(zip(clean_header, clean_body), start=1):
+            parsed.append({
+                "title": str(i),
+                "columns": cols,
+                "rows": rows
+            })
+
+        clean_parsed = filter_geo_admin_column(parsed)
+        return clean_parsed
+
+    except Exception as e:
+        raise PDFReadError(f"Failed to read PDF: {e}", code=422)
+
+
+def convert_df(payload):
+    try:
+        if "columns" not in payload or "rows" not in payload:
+            raise ValueError("Payload is missing the 'columns' or 'rows' key.")
+
+        if not isinstance(payload["columns"], list):
+            raise TypeError("'columns' must be a list.")
+        if not isinstance(payload["rows"], list):
+            raise TypeError("'rows' must be a list.")
+
+        for i, row in enumerate(payload["rows"]):
+            if len(row) != len(payload["columns"]):
+                raise ValueError(f"Row {i} does not have the same number of elements as the columns.")
+
+        df = pd.DataFrame(payload["rows"], columns=payload["columns"])
+
+        if "title" in payload:
+            df.attrs["title"] = payload["title"]
+
+        return df
+
+    except Exception as e:
+        raise PDFReadError(f"Failed to convert payload to DataFrame: {e}", code=400)
diff --git a/services/upload_file/read_shp/reader_shp.py b/services/upload_file/readers/reader_shp.py
similarity index 100%
rename from services/upload_file/read_shp/reader_shp.py
rename to services/upload_file/readers/reader_shp.py
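
A usage sketch for the PDF reader above, assuming a hypothetical file path; page strings follow the grammar from the docstring ("1", "2-4", "1,3-5"), parsed by parse_page_selection in services/upload_file/utils/pdf_cleaner.py (not shown in this patch):

    from services.upload_file.readers.reader_pdf import read_pdf, convert_df
    from services.upload_file.upload_exceptions import PDFReadError

    try:
        tables = read_pdf("uploads/report.pdf", page="1,3-5")  # pages 1, 3, 4 and 5
        if tables:
            df = convert_df(tables[0])  # DataFrame; title kept in df.attrs["title"]
    except PDFReadError as e:
        print(f"PDF extraction failed: {e}")
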
diff --git a/services/upload_file/read_shp/reader_shp.py b/services/upload_file/readers/reader_shp.py
similarity index 100%
rename from services/upload_file/read_shp/reader_shp.py
rename to services/upload_file/readers/reader_shp.py
diff --git a/services/upload_file/upload.py b/services/upload_file/upload.py
index dbf9fdd..1a77b42 100644
--- a/services/upload_file/upload.py
+++ b/services/upload_file/upload.py
@@ -1,27 +1,30 @@
+import json
 import os
 import pandas as pd
 import geopandas as gpd
 import numpy as np
+import re
 import zipfile
 from shapely.geometry.base import BaseGeometry
 from shapely.geometry import base as shapely_base
 from fastapi import File, Form, UploadFile, HTTPException
-from fastapi.responses import JSONResponse
 from core.config import UPLOAD_FOLDER, MAX_FILE_MB, VALID_WKT_PREFIXES
-from services.upload_file.read_csv.reader_csv import read_csv
-from services.upload_file.read_shp.reader_shp import read_shp
-from services.upload_file.read_gdb.reader_gdb import read_gdb
-from services.upload_file.read_mpk.reader_mpk import read_mpk
-from services.upload_file.read_pdf.reader_pdf import convert_df, read_pdf
-from services.upload_file.geom_detector.geometry_detector import detect_and_build_geometry
-from services.upload_file.geom_detector.geometry_detector import attach_polygon_geometry_auto
+from services.upload_file.readers.reader_csv import read_csv
+from services.upload_file.readers.reader_shp import read_shp
+from services.upload_file.readers.reader_gdb import read_gdb
+from services.upload_file.readers.reader_mpk import read_mpk
+from services.upload_file.readers.reader_pdf import convert_df, read_pdf
+from services.upload_file.utils.geometry_detector import detect_and_build_geometry
+from services.upload_file.utils.geometry_detector import attach_polygon_geometry_auto
 from database.connection import engine
 from database.models import Base
 from pydantic import BaseModel
 from typing import List, Optional
 from shapely import wkt
 from sqlalchemy import text
-Base.metadata.create_all(bind=engine)
+from datetime import datetime
+# successRes/errorRes are JSON response helpers defined in response.py (added in
+# this patch, not shown here); they build responses and are returned, not raised.
+from response import successRes, errorRes
+# Base.metadata.create_all(bind=engine)
 
 
@@ -78,7 +81,7 @@ def process_data(df: pd.DataFrame, ext: str):
             print(f"[INFO] Tipe Geometry: {geom_type}")
             print(f"[INFO] Jumlah geometry kosong: {null_geom}")
         else:
-            response = {
+            res = {
                 "message": "Tidak menemukan tabel yang relevan.",
                 "file_type": ext,
                 "rows": 0,
@@ -91,7 +94,7 @@
                 "preview": []
             }
 
-            return JSONResponse(content=response)
+            return errorRes(message="Tidak berhasil mencocokkan geometry pada tabel.", details=res, status_code=422)
 
     result = result.replace([pd.NA, float('inf'), float('-inf')], None)
     if isinstance(result, gpd.GeoDataFrame) and 'geometry' in result.columns:
@@ -116,7 +119,8 @@ def process_data(df: pd.DataFrame, ext: str):
     else:
         warning_examples = []
 
-    preview_data = result.to_dict(orient="records")
+    preview_data = result.head(10).to_dict(orient="records")
+    # preview_data = result.to_dict(orient="records")
     preview_safe = [
         {k: safe_json(v) for k, v in row.items()}
         for row in preview_data
@@ -139,7 +143,7 @@
         "preview": preview_safe
     }
 
-    # return JSONResponse(content=response)
+    # return successRes(content=response)
     return response
 
 
@@ -157,7 +161,7 @@ async def handle_upload_file(file: UploadFile = File(...), page: Optional[str] =
     contents = await file.read()
     size_mb = len(contents) / (1024*1024)
     if size_mb > MAX_FILE_MB:
-        raise HTTPException(status_code=413, detail="Ukuran File Terlalu Besar")
+        # errorRes builds a JSON response, so it must be returned rather than raised
+        return errorRes(status_code=413, message="Ukuran File Terlalu Besar")
     tmp_path = UPLOAD_FOLDER / fname
     with open(tmp_path, "wb") as f:
         f.write(contents)
@@ -174,19 +178,19 @@ async def handle_upload_file(file: UploadFile = File(...), page: Optional[str] =
     elif ext == ".pdf":
         tbl = read_pdf(tmp_path, page)
         if len(tbl) == 0:
-            response = {
+            res = {
                 "message": "Tidak ditemukan tabel valid",
-                "tables": tbl,
+                "tables": {},
                 "file_type": ext
             }
-            return JSONResponse(content=response)
+            return successRes(message="Tidak ditemukan tabel valid", data=res)
         elif len(tbl) > 1:
-            response = {
+            res = {
                 "message": "File berhasil dibaca dan dianalisis.",
                 "tables": tbl,
                 "file_type": ext
             }
-            return JSONResponse(content=response)
+            return successRes(data=res, message="File berhasil dibaca dan dianalisis.")
         else:
             df = convert_df(tbl[0])
     elif ext == ".zip":
@@ -201,25 +205,29 @@ async def handle_upload_file(file: UploadFile = File(...), page: Optional[str] =
             df = read_gdb(str(tmp_path))
 
         else:
-            raise HTTPException(
+            return errorRes(
                 status_code=400,
-                detail="ZIP file tidak mengandung SHP atau GDB yang valid."
+                message="ZIP file tidak mengandung SHP atau GDB yang valid."
             )
         else:
-            raise HTTPException(status_code=400, detail="Unsupported file type")
+            return errorRes(status_code=400, message="Unsupported file type")
 
         if df is None or (hasattr(df, "empty") and df.empty):
-            return JSONResponse({"error": "No valid table detected"}, status_code=400)
+            return successRes(message="File berhasil dibaca, tetapi tidak ditemukan tabel valid")
 
         res = process_data(df, ext)
         tmp_path.unlink(missing_ok=True)
 
-        return JSONResponse(content=res)
+        return successRes(data=res)
 
     except Exception as e:
         print(f"[ERROR] {e}")
-        return JSONResponse({"error": str(e)}, status_code=500)
+        return errorRes(
+            message="Internal Server Error",
+            details=str(e),
+            status_code=500
+        )
 
     # finally:
     #     db_session.close()
 
@@ -237,16 +245,15 @@ async def handle_process_pdf(payload: PdfRequest):
     try:
         df = convert_df(payload.model_dump())
         if df is None or (hasattr(df, "empty") and df.empty):
-            return JSONResponse({"error": "No valid table detected"}, status_code=400)
+            return errorRes(message="Tidak ada tabel")
 
         res = process_data(df, '.pdf')
-
-        return JSONResponse(content=res)
+        return successRes(data=res)
 
     except Exception as e:
         print(f"[ERROR] {e}")
-        return JSONResponse({"error": str(e)}, status_code=500)
+        return errorRes(message="Internal Server Error", details=str(e), status_code=500)
 
     # finally:
    #     db_session.close()
 
@@ -263,39 +270,351 @@ class UploadRequest(BaseModel):
     rows: List[dict]
     columns: List[str]
 
-def handle_to_postgis(payload: UploadRequest):
-    try:
-        table_name = payload.title.lower().replace(" ", "_").replace("-","_")
-        df = pd.DataFrame(payload.rows)
-        print(f"[INFO] Diterima {len(df)} baris data dari frontend.")
-
-        if "geometry" in df.columns:
-            df["geometry"] = df["geometry"].apply(
-                lambda g: wkt.loads(g) if isinstance(g, str) and g.strip().upper().startswith(VALID_WKT_PREFIXES) else None
-            )
-            gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")
-        else:
-            raise HTTPException(status_code=400, detail="Kolom geometry tidak ditemukan dalam data.")
-
-        with engine.begin() as conn:
-            conn.execute(text(f"DROP TABLE IF EXISTS {table_name}"))
-
-        gdf.to_postgis(table_name, engine, if_exists="replace", index=False)
-
-        with engine.begin() as conn:
-            conn.execute(text(f'ALTER TABLE "{table_name}" ADD COLUMN _id SERIAL PRIMARY KEY;'))
-
-        print(f"[INFO] Tabel '{table_name}' berhasil dibuat di PostGIS ({len(gdf)} baris).")
-
-        return {
-            "table_name": table_name,
-            "status": "success",
-            "message": f"Tabel '{table_name}' berhasil diunggah ke PostGIS.",
-            "total_rows": len(gdf),
-            "geometry_type": list(gdf.geom_type.unique())
-        }
-
-    except Exception as e:
-        print(f"[ERROR] Gagal upload ke PostGIS: {e}")
-        raise HTTPException(status_code=500, detail=str(e))
df["geometry"] = df["geometry"].apply( +# lambda g: wkt.loads(g) if isinstance(g, str) and g.strip().upper().startswith(VALID_WKT_PREFIXES) else None +# ) + +# gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326") + +# insert_count = 0 +# with engine.begin() as conn: + +# # 💡 Tambahkan blok auto-create partisi di sini +# conn.execute(text(f""" +# DO $$ +# BEGIN +# IF NOT EXISTS ( +# SELECT 1 FROM pg_tables WHERE tablename = 'test_partition_user_{user_id}' +# ) THEN +# EXECUTE format(' +# CREATE TABLE test_partition_user_%s +# PARTITION OF test_partition +# FOR VALUES IN (%s); +# ', {user_id}, {user_id}); +# EXECUTE format('CREATE INDEX ON test_partition_user_%s USING GIST (geom);', {user_id}); +# EXECUTE format('CREATE INDEX ON test_partition_user_%s USING GIN (properties);', {user_id}); +# END IF; +# END +# $$; +# """)) + +# # Lanjut insert data seperti biasa +# for _, row in gdf.iterrows(): +# geom_wkt = row["geometry"].wkt if row["geometry"] is not None else None +# properties = row.drop(labels=["geometry"]).to_dict() + +# conn.execute( +# text(""" +# INSERT INTO test_partition (user_id, geom, properties, created_at) +# VALUES (:user_id, ST_Force2D(ST_GeomFromText(:geom, 4326)), CAST(:properties AS jsonb), :created_at) +# """), +# { +# "user_id": user_id, +# "geom": geom_wkt, +# "properties": json.dumps(properties), +# "created_at": datetime.utcnow() +# } +# ) +# insert_count += 1 + +# print(f"[INFO] Berhasil memasukkan {insert_count} baris ke partisi user_id={user_id}.") + +# return { +# "table_name": table_name, +# "user_id": user_id, +# "status": "success", +# "message": f"Data berhasil ditambahkan ke partisi user_id={user_id}.", +# "total_rows": insert_count, +# "geometry_type": list(gdf.geom_type.unique()) +# } + +# except Exception as e: +# print(f"[ERROR] Gagal upload ke PostGIS partition: {e}") +# raise HTTPException(status_code=500, detail=str(e)) + + + + +# Daftar prefix WKT yang valid +VALID_WKT_PREFIXES = ("POINT", "LINESTRING", "POLYGON", "MULTIPOLYGON", "MULTILINESTRING") + + +def slugify(value: str) -> str: + """Mengubah judul dataset jadi nama aman untuk VIEW""" + return re.sub(r'[^a-zA-Z0-9]+', '_', value.lower()).strip('_') + + +# async def create_dataset_view_from_metadata(conn, metadata_id: int, user_id: int, title: str): +# """Membuat VIEW PostgreSQL berdasarkan metadata dataset dan registrasi geometry untuk QGIS.""" +# norm_title = slugify(title) +# view_name = f"v_user_{user_id}_{norm_title}" +# base_table = f"test_partition_user_{user_id}" + +# # 1️⃣ Hapus view lama jika ada +# drop_query = text(f"DROP VIEW IF EXISTS {view_name} CASCADE;") +# await conn.execute(drop_query) + +# # 2️⃣ Buat view baru +# create_view_query = text(f""" +# CREATE OR REPLACE VIEW {view_name} AS +# SELECT p.*, m.title, m.year, m.description +# FROM {base_table} p +# JOIN dataset_metadata m ON m.id = p.metadata_id +# WHERE p.metadata_id = {metadata_id}; +# """) +# await conn.execute(create_view_query) + +# # 3️⃣ Daftarkan geometry column agar QGIS mengenali layer ini +# # (gunakan Populate_Geometry_Columns jika PostGIS >= 3) +# populate_query = text(f"SELECT Populate_Geometry_Columns('{view_name}'::regclass);") +# await conn.execute(populate_query) + +# print(f"[INFO] VIEW {view_name} berhasil dibuat dan geometry terdaftar.") + + +async def create_dataset_view_from_metadata(conn, metadata_id: int, user_id: int, title: str): + """Buat VIEW dinamis sesuai struktur atribut JSON pada dataset (hindari duplikasi nama kolom).""" + norm_title = slugify(title) + view_name = 
f"v_user_{user_id}_{norm_title}" + base_table = f"test_partition_user_{user_id}" + + # Ambil daftar field dari metadata + result = await conn.execute(text("SELECT fields FROM dataset_metadata WHERE id=:mid"), {"mid": metadata_id}) + fields_json = result.scalar_one_or_none() + + # --- daftar kolom bawaan dari tabel utama + base_columns = {"id", "user_id", "metadata_id", "geom"} + + columns_sql = "" + field_list = [] + + if fields_json: + try: + # handle jika data sudah berupa list atau string JSON + if isinstance(fields_json, str): + field_list = json.loads(fields_json) + elif isinstance(fields_json, list): + field_list = fields_json + else: + raise ValueError(f"Tipe data fields_json tidak dikenali: {type(fields_json)}") + + for f in field_list: + safe_col = slugify(f) + # Hindari duplikat nama dengan kolom utama + if safe_col in base_columns: + alias_name = f"attr_{safe_col}" + else: + alias_name = safe_col + + columns_sql += f", p.attributes->>'{f}' AS {alias_name}" + + except Exception as e: + print(f"[WARN] Gagal parse field list metadata: {e}") + + # 1️⃣ Drop view lama + await conn.execute(text(f"DROP VIEW IF EXISTS {view_name} CASCADE;")) + + # 2️⃣ Buat view baru dinamis + create_view_query = f""" + CREATE OR REPLACE VIEW {view_name} AS + SELECT p.id, p.user_id, p.metadata_id, p.geom + {columns_sql}, + m.title, m.year, m.description + FROM {base_table} p + JOIN dataset_metadata m ON m.id = p.metadata_id + WHERE p.metadata_id = {metadata_id}; + """ + await conn.execute(text(create_view_query)) + + # 3️⃣ Register geometry untuk QGIS + await conn.execute(text(f"SELECT Populate_Geometry_Columns('{view_name}'::regclass);")) + + print(f"[INFO] VIEW {view_name} berhasil dibuat (kolom dinamis: {field_list if field_list else '(none)'}).") + + +async def handle_to_postgis(payload, engine, user_id: int = 3): + """ + Menangani upload data spasial ke PostGIS (dengan partition per user). 
+
+
+async def handle_to_postgis(payload, engine, user_id: int = 3):
+    """
+    Handles uploading spatial data to PostGIS (partitioned per user).
+    - If the user's partition does not exist yet, it is created automatically
+    - Dataset metadata is stored in the dataset_metadata table
+    - The spatial rows go into the partition table (test_partition_user_{id})
+    - A VIEW is created automatically for QGIS
+    """
+
+    try:
+        df = pd.DataFrame(payload.rows)
+        print(f"[INFO] Diterima {len(df)} baris data dari frontend.")
+
+        # --- Validate the geometry column ---
+        if "geometry" not in df.columns:
+            # errorRes builds a JSON response, so return it instead of raising
+            return errorRes(status_code=400, message="Kolom 'geometry' tidak ditemukan dalam data.")
+
+        # --- Parse geometry strings into shapely objects ---
+        df["geometry"] = df["geometry"].apply(
+            lambda g: wkt.loads(g)
+            if isinstance(g, str) and g.strip().upper().startswith(VALID_WKT_PREFIXES)
+            else None
+        )
+
+        # --- Build the GeoDataFrame ---
+        gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")
+
+        # --- Metadata from the payload ---
+        # dataset_title = getattr(payload, "dataset_title", None)
+        # dataset_year = getattr(payload, "dataset_year", None)
+        # dataset_desc = getattr(payload, "dataset_description", None)
+        dataset_title = "longsor 2020"
+        dataset_year = 2020
+        dataset_desc = "test metadata"
+
+        if not dataset_title:
+            return errorRes(status_code=400, message="Field 'dataset_title' wajib ada untuk metadata.")
+
+        async with engine.begin() as conn:
+            fields = [col for col in df.columns if col != "geometry"]
+            # 💾 1️⃣ Store the dataset metadata
+            print("[INFO] Menyimpan metadata dataset...")
+            result = await conn.execute(
+                text("""
+                    INSERT INTO dataset_metadata (user_id, title, year, description, fields, created_at)
+                    VALUES (:user_id, :title, :year, :desc, :fields, :created_at)
+                    RETURNING id;
+                """),
+                {
+                    "user_id": user_id,
+                    "title": dataset_title,
+                    "year": dataset_year,
+                    "desc": dataset_desc,
+                    "fields": json.dumps(fields),
+                    "created_at": datetime.utcnow(),
+                },
+            )
+            metadata_id = result.scalar_one()
+            print(f"[INFO] Metadata disimpan dengan ID {metadata_id}")
+
+            # ⚙️ 2️⃣ Auto-create the partition if it does not exist yet
+            print(f"[INFO] Memastikan partisi test_partition_user_{user_id} tersedia...")
+            await conn.execute(
+                text(f"""
+                    DO $$
+                    BEGIN
+                        IF NOT EXISTS (
+                            SELECT 1 FROM pg_tables WHERE tablename = 'test_partition_user_{user_id}'
+                        ) THEN
+                            EXECUTE format('
+                                CREATE TABLE test_partition_user_%s
+                                PARTITION OF test_partition
+                                FOR VALUES IN (%s);
+                            ', {user_id}, {user_id});
+                            EXECUTE format('CREATE INDEX IF NOT EXISTS idx_partition_user_%s_geom ON test_partition_user_%s USING GIST (geom);', {user_id}, {user_id});
+                            EXECUTE format('CREATE INDEX IF NOT EXISTS idx_partition_user_%s_metadata ON test_partition_user_%s (metadata_id);', {user_id}, {user_id});
+                        END IF;
+                    END
+                    $$;
+                """)
+            )
+
+            # 🧩 3️⃣ Insert the spatial rows into the partition
+            print(f"[INFO] Memasukkan data ke test_partition_user_{user_id} ...")
+            insert_count = 0
+            for _, row in gdf.iterrows():
+                geom_wkt = row["geometry"].wkt if row["geometry"] is not None else None
+                attributes = row.drop(labels=["geometry"]).to_dict()
+
+                await conn.execute(
+                    text("""
+                        INSERT INTO test_partition (user_id, metadata_id, geom, attributes, created_at)
+                        VALUES (:user_id, :metadata_id, ST_Force2D(ST_GeomFromText(:geom, 4326)),
+                                CAST(:attr AS jsonb), :created_at);
+                    """),
+                    {
+                        "user_id": user_id,
+                        "metadata_id": metadata_id,
+                        "geom": geom_wkt,
+                        "attr": json.dumps(attributes),
+                        "created_at": datetime.utcnow(),
+                    },
+                )
+                insert_count += 1
+
+            # 🧩 4️⃣ Create the VIEW for the new dataset in QGIS
+            await create_dataset_view_from_metadata(conn, metadata_id, user_id, dataset_title)
+
+        print(f"[INFO] ✅ Berhasil memasukkan {insert_count} baris ke partisi user_id={user_id} (metadata_id={metadata_id}).")
+
+        return {
+            "status": "success",
+            "user_id": user_id,
+            "metadata_id": metadata_id,
+            "dataset_title": dataset_title,
+            "inserted_rows": insert_count,
+            "geometry_type": list(gdf.geom_type.unique()),
+        }
+
+    except Exception as e:
+        print(f"[ERROR] Gagal upload ke PostGIS partition: {e}")
+        return errorRes(status_code=500, message="Gagal upload ke PostGIS partition", details=str(e))
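+
+# The DO block above only creates per-user LIST partitions; the parent table is
+# assumed to exist already (it is not created in this patch). A sketch of the
+# assumed shape:
+#
+#   CREATE TABLE test_partition (
+#       id          BIGSERIAL,
+#       user_id     INT  NOT NULL,
+#       metadata_id INT  NOT NULL,
+#       geom        geometry(Geometry, 4326),
+#       attributes  JSONB,
+#       created_at  TIMESTAMP,
+#       PRIMARY KEY (id, user_id)
+#   ) PARTITION BY LIST (user_id);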
diff --git a/services/upload_file/upload_exceptions.py b/services/upload_file/upload_exceptions.py
new file mode 100644
index 0000000..ae496ce
--- /dev/null
+++ b/services/upload_file/upload_exceptions.py
@@ -0,0 +1,9 @@
+class PDFReadError(Exception):
+    """Custom exception for errors raised while reading a PDF file."""
+    def __init__(self, message: str, code: int = 400):
+        super().__init__(message)
+        self.message = message
+        self.code = code
+
+    def to_dict(self):
+        return {"error": self.message, "code": self.code}
diff --git a/services/upload_file/utils/__pycache__/geometry_detector.cpython-39.pyc b/services/upload_file/utils/__pycache__/geometry_detector.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5a212a8cb9ee38d12010c6679665ea253576fc04
GIT binary patch
[literal 13831 — compiled-bytecode payload omitted]
diff --git a/services/upload_file/geom_detector/geometry_detector.py b/services/upload_file/utils/geometry_detector.py
similarity index 100%
rename from services/upload_file/geom_detector/geometry_detector.py
rename to services/upload_file/utils/geometry_detector.py
diff --git a/services/upload_file/utils/pdf_cleaner.py b/services/upload_file/utils/pdf_cleaner.py
new file mode 100644
index 0000000..86d2dd1
--- /dev/null
+++ b/services/upload_file/utils/pdf_cleaner.py
@@ -0,0 +1,159 @@
+import re
+import itertools
+
+geo_admin_keywords = [
+    'lat', 'lon', 'long', 'latitude', 'longitude', 'koordinat', 'geometry', 'geometri',
+    'desa', 'kelurahan', 'kel', 'kecamatan', 'kabupaten', 'kab', 'kota', 'provinsi',
+    'lokasi', 'region', 'area', 'zone', 'boundary', 'batas'
+]
+
+def normalize_text(text):
+    # Lowercase, keep only [a-z0-9/ ], and collapse runs of whitespace.
+    text = text.lower()
+    text = re.sub(r'[^a-z0-9/ ]+', ' ', text)
+    text = re.sub(r'\s+', ' ', text).strip()
+    return text
+
+def generate_combined_patterns(keywords):
+    # Build "a / b" and "b / a" patterns for every pair of keywords.
+    combos = list(itertools.combinations(keywords, 2))
+    patterns = []
+    for a, b in combos:
+        patterns.append(rf'{a}\s*/\s*{b}')
+        patterns.append(rf'{b}\s*/\s*{a}')
+    return patterns
+
+combined_patterns = generate_combined_patterns(geo_admin_keywords)
+
+def contains_geo_admin_keywords(text):
+    text_clean = normalize_text(text)
+    if len(text_clean) < 3:
+        return False
+
+    for pattern in combined_patterns:
+        if re.search(pattern, text_clean):
+            return True
+
+    for kw in geo_admin_keywords:
+        if re.search(rf'(^|[\s/_-]){kw}([\s/_-]|$)', text_clean):
+            return True
+
+    return False
+
+def filter_geo_admin_column(tables):
+    # Keep only tables whose columns mention a geographic/administrative keyword.
+    filtered = []
+    for table in tables:
+        found = any(contains_geo_admin_keywords(col) for col in table['columns'])
+        if found:
+            filtered.append(table)
+    return filtered
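+
+# Examples (hypothetical headers): contains_geo_admin_keywords("Desa/Kelurahan") -> True
+# (matches the combined "desa / kelurahan" pattern), while
+# contains_geo_admin_keywords("Jumlah KK") -> False (no keyword present).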
+
+
+NUMBER_HEADER_KEYWORDS = [
+    "no","no.","nomor","nomor urut","no urut","No","Nomor","No Urut","Index",
+    "ID","Sr No","S/N","SN","Sl No"
+]
+
+def has_number_header(header):
+    # `header` is the first column's name; a substring match is intentional here.
+    header_text = header
+    return any(keyword in header_text for keyword in NUMBER_HEADER_KEYWORDS)
+
+def is_numbering_column(col_values):
+    # True when more than 60% of the non-empty string cells look like small integers.
+    numeric_like = 0
+    total = 0
+    for v in col_values:
+        if not v or not isinstance(v, str):
+            continue
+        total += 1
+        if re.fullmatch(r"0*\d{1,3}", v.strip()):
+            numeric_like += 1
+    return total > 0 and (numeric_like / total) > 0.6
+
+def is_numeric_value(v):
+    if v is None:
+        return False
+    if isinstance(v, (int, float)):
+        return True
+    if isinstance(v, str) and re.fullmatch(r"0*\d{1,3}", v.strip()):
+        return True
+    return False
+
+def cleaning_column(headers, bodies):
+    # Called as cleaning_column(columns, [body]): `headers` is one table's column
+    # list and `bodies` holds a single body, so the zip below runs exactly once,
+    # pairing the first column name with the whole body. `len(headers)` is then
+    # the table's column count, used to drop ragged rows.
+    cleaned_bodies = []
+
+    for header, body in zip(headers, bodies):
+        if not body:
+            cleaned_bodies.append(body)
+            continue
+
+        header_has_number = has_number_header(header)
+        first_col = [row[0] for row in body if row and len(row) > 0]
+        first_col_is_numbering = is_numbering_column(first_col)
+
+        if not header_has_number and first_col_is_numbering:
+            new_body = []
+            for row in body:
+                if not row:
+                    continue
+                first_val = row[0]
+                if is_numeric_value(first_val) and len(row) > 1:
+                    new_body.append(row[1:])
+                else:
+                    new_body.append(row)
+            body = new_body
+
+        header_len = len(headers)
+        filtered_body = [row for row in body if len(row) == header_len]
+
+        cleaned_bodies.append(filtered_body)
+
+    return cleaned_bodies
+
+def parse_page_selection(selectedPage: str, total_pages: int):
+    # e.g. parse_page_selection("1,3-5", 10) -> [1, 3, 4, 5]
+    if not selectedPage:
+        return list(range(1, total_pages + 1))
+
+    pages = set()
+    parts = re.split(r'[,\s]+', selectedPage.strip())
+
+    for part in parts:
+        if '-' in part:
+            try:
+                start, end = map(int, part.split('-'))
+                pages.update(range(start, end + 1))
+            except ValueError:
+                continue
+        else:
+            try:
+                pages.add(int(part))
+            except ValueError:
+                continue
+
+    valid_pages = [p for p in sorted(pages) if 1 <= p <= total_pages]
+    return valid_pages
+
+def is_number(s):
+    if s is None:
+        return False
+    s = str(s).strip().replace(',', '').replace('.', '')
+    return s.isdigit()
+
+def row_ratio(row):
+    # Share of non-empty cells that are numeric.
+    non_empty = [c for c in row if c not in (None, '', ' ')]
+    if not non_empty:
+        return 0
+    num_count = sum(is_number(c) for c in non_empty)
+    return num_count / len(non_empty)
+
+def has_mixed_text_and_numbers(row):
+    non_empty = [c for c in row if c not in (None, '', ' ')]
+    has_text = any(isinstance(c, str) and re.search(r'[A-Za-z]', str(c)) for c in non_empty)
+    has_num = any(is_number(c) for c in non_empty)
+    return has_text and has_num
+
+def is_short_text_row(row):
+    """Detect a short text row (at most two short, non-numeric cells)."""
+    non_empty = [str(c).strip() for c in row if c not in (None, '', ' ')]
+    if not non_empty:
+        return False
+    text_only = all(not is_number(c) for c in non_empty)
+    joined = " ".join(non_empty)
+    return text_only and len(non_empty) <= 2 and len(joined) < 20
\ No newline at end of file
diff --git a/utils/logger_config.py b/utils/logger_config.py
new file mode 100644
index 0000000..4d2803a
--- /dev/null
+++ b/utils/logger_config.py
@@ -0,0 +1,32 @@
+import logging
+import os
+
+LOG_DIR = "logs"
+os.makedirs(LOG_DIR, exist_ok=True)
+
+def setup_logger(name: str):
+    """
+    Standard logger configuration for every service.
+
+    Log format:
+    [LEVEL] [module name] message
+    """
+    logger = logging.getLogger(name)
+    logger.setLevel(logging.INFO)
+
+    # Handler that writes to a file
+    file_handler = logging.FileHandler(os.path.join(LOG_DIR, "app.log"))
+    file_handler.setLevel(logging.INFO)
+
+    # Handler for the console (stdout)
+    console_handler = logging.StreamHandler()
+    console_handler.setLevel(logging.INFO)
+
+    formatter = logging.Formatter('[%(levelname)s] [%(name)s] %(message)s')
+    file_handler.setFormatter(formatter)
+    console_handler.setFormatter(formatter)
+
+    if not logger.handlers:
+        logger.addHandler(file_handler)
+        logger.addHandler(console_handler)
+
+    return logger
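+
+# Usage sketch:
+#   from utils.logger_config import setup_logger
+#   logger = setup_logger(__name__)
+#   logger.info("siap")   # -> "[INFO] [my.module] siap" on stdout and in logs/app.log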
diff --git a/utils/qgis_init.py b/utils/qgis_init.py
new file mode 100644
index 0000000..9c27cb4
--- /dev/null
+++ b/utils/qgis_init.py
@@ -0,0 +1,30 @@
+# utils/qgis_init.py
+import os
+import sys
+
+# QGIS install location on Linux (Ubuntu / Debian)
+QGIS_PREFIX = "/usr"
+
+# Path to the QGIS Python modules
+sys.path.append("/usr/share/qgis/python")
+
+# Environment variables so QGIS can run headless (without a GUI)
+os.environ["QGIS_PREFIX_PATH"] = QGIS_PREFIX
+os.environ["QT_QPA_PLATFORM"] = "offscreen"
+
+from qgis.core import QgsApplication
+from qgis.analysis import QgsNativeAlgorithms
+import processing
+from processing.core.Processing import Processing
+
+
+def init_qgis():
+    qgs = QgsApplication([], False)
+    qgs.initQgis()
+
+    # Register the QGIS processing provider
+    Processing.initialize()
+    QgsApplication.processingRegistry().addProvider(QgsNativeAlgorithms())
+
+    print("QGIS initialized successfully")
+    return qgs
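+
+# Usage sketch (assumption — the main.py wiring is not shown in this hunk): call
+# init_qgis() once at startup and keep the returned QgsApplication alive, e.g.
+#
+#   from utils.qgis_init import init_qgis
+#   qgs = init_qgis()          # before the FastAPI app starts serving
+#   ...
+#   qgs.exitQgis()             # on shutdown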