Init Commit
This commit is contained in:
commit
16f0042508
12
.gitignore
vendored
Normal file
12
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
.env
|
||||
main_old.py
|
||||
sijalinmaja.json
|
||||
|
||||
|
||||
venv/
|
||||
pdf/
|
||||
data_cache/
|
||||
cache/
|
||||
testing/
|
||||
test-ai/
|
||||
uploads/
|
||||
19
core/config.py
Normal file
19
core/config.py
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
# Application configuration, loaded from environment variables (.env supported).
from pathlib import Path
from dotenv import load_dotenv
import os

# Pull variables from a local .env file into the process environment.
load_dotenv()

# Main PostGIS connection string (no default: must be provided by the environment).
POSTGIS_URL = os.getenv("POSTGIS_URL")
# Directory where uploaded files are staged before processing.
UPLOAD_FOLDER = Path(os.getenv("UPLOAD_FOLDER", "./uploads"))
# Maximum accepted upload size, in megabytes.
MAX_FILE_MB = int(os.getenv("MAX_FILE_MB", 200))

# Read-only reference database holding administrative boundary polygons.
REFERENCE_DB_URL = os.getenv("REFERENCE_DB_URL")
REFERENCE_SCHEMA = os.getenv("REFERENCE_SCHEMA", "batas_wilayah")
# Name column in each reference table, per administrative level.
# NOTE(review): 'desa' and 'kabupaten' both map to 'NAMOBJ' — confirm intended.
REF_COLUMN_MAP = {
    'desa': 'NAMOBJ',
    'kecamatan': 'NAMA_KECAMATAN',
    'kabupaten': 'NAMOBJ'
}

# Directory used for cached intermediate data.
CACHE_FOLDER = Path(os.getenv("CACHE_FOLDER", "./cache"))
|
||||
6
database/connection.py
Normal file
6
database/connection.py
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
# Engine/session setup for the main PostGIS database.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from core.config import POSTGIS_URL

# pool_pre_ping revalidates pooled connections before each checkout,
# avoiding "server closed the connection" errors after idle periods.
engine = create_engine(POSTGIS_URL, pool_pre_ping=True)
SessionLocal = sessionmaker(bind=engine)
|
||||
16
database/models.py
Normal file
16
database/models.py
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
from sqlalchemy import Column, Integer, String, Text, TIMESTAMP
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.sql import func

Base = declarative_base()


class UploadLog(Base):
    """Audit row recorded for each file upload handled by the service."""
    __tablename__ = "upload_logs"
    id = Column(Integer, primary_key=True, index=True)
    filename = Column(String, nullable=False)    # original uploaded file name
    table_name = Column(String, nullable=False)  # destination table created for the data
    file_type = Column(String, nullable=False)   # source extension (.csv, .pdf, .zip, ...)
    rows_count = Column(Integer)                 # number of rows loaded
    uploaded_at = Column(TIMESTAMP, server_default=func.now())  # set by the database at insert time
    status = Column(String)                      # outcome marker, e.g. success / failed
    message = Column(Text)                       # free-form detail or error text
|
||||
16
database/uploader.py
Normal file
16
database/uploader.py
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
import geopandas as gpd
import pandas as pd
from database.connection import engine
from sqlalchemy import text


def save_dataframe_dynamic(df: pd.DataFrame, table_name: str):
    """Save pandas DataFrame to Postgres (non-geo).

    Replaces any existing table of the same name; inserts are batched
    (multi-row VALUES, 1000 rows per chunk).
    """
    df.to_sql(table_name, engine, if_exists="replace", index=False, method='multi', chunksize=1000)


def save_geodataframe(gdf: gpd.GeoDataFrame, table_name: str):
    """Save GeoDataFrame to PostGIS (requires geoalchemy/geopandas)."""
    # ensure geometry column exists and CRS set; default to WGS84 when missing
    if gdf.crs is None:
        gdf = gdf.set_crs("EPSG:4326", allow_override=True)
    # geopandas >= 0.10 has to_postgis in some installs; fallback using SQLAlchemy + GeoAlchemy2:
    gdf.to_postgis(table_name, engine, if_exists="replace")
|
||||
3
init_db.py
Normal file
3
init_db.py
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
# One-shot script: create all ORM-declared tables (e.g. upload_logs) in the main DB.
from database.connection import engine
from database.models import Base

Base.metadata.create_all(bind=engine)
|
||||
437
main.py
Normal file
437
main.py
Normal file
|
|
@ -0,0 +1,437 @@
|
|||
import os
import pandas as pd
import geopandas as gpd
import numpy as np
import zipfile
from shapely.geometry.base import BaseGeometry
from shapely.geometry import base as shapely_base
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
from core.config import UPLOAD_FOLDER, MAX_FILE_MB
from services.reader_csv import read_csv
from services.reader_shp import read_shp
from services.reader_gdb import read_gdb
from services.reader_pdf import convert_df, read_pdf
from services.geometry_detector import detect_and_build_geometry
from services.geometry_detector import attach_polygon_geometry_auto
from database.connection import engine
from database.models import Base
import time
import pathlib
from fastapi.middleware.cors import CORSMiddleware

from pydantic import BaseModel
from typing import List
from shapely import wkt
from sqlalchemy import text


# Make sure the upload directory exists before the app accepts files.
UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)

app = FastAPI(title="ETL Geo Upload Service")


# Front-end development origins allowed to call this API
# (React on :3000, Vite on :5173).
origins = [
    "http://localhost:3000",
    "http://127.0.0.1:3000",
    "http://localhost:5173",
    "http://127.0.0.1:5173",
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# Create upload_logs table if not exists
Base.metadata.create_all(bind=engine)
|
||||
|
||||
def generate_table_name(filename: str, prefix: str = "data"):
    """Build a unique table name: <prefix>_<sanitized file stem>_<timestamp>."""
    stem = pathlib.Path(filename).stem
    # Replace every character that is not alphanumeric or '_' with '_',
    # so the result is always a safe SQL identifier fragment.
    sanitized = "".join(ch if (ch.isalnum() or ch == "_") else "_" for ch in stem)
    timestamp = time.strftime("%Y%m%d%H%M%S")
    return "_".join([prefix, sanitized, timestamp])
|
||||
|
||||
|
||||
def is_geom_empty(g):
    """Return True when a geometry value is missing (None/NaN) or an empty shape."""
    if g is None:
        return True
    if isinstance(g, float):
        # A float in a geometry slot is pandas' NaN placeholder for "missing".
        return bool(pd.isna(g))
    if isinstance(g, BaseGeometry):
        # A real shapely geometry may still be empty (e.g. POLYGON EMPTY).
        return g.is_empty
    return False
|
||||
|
||||
|
||||
def safe_json(value):
    """Convert numpy/pandas/shapely values into JSON-serializable Python types.

    Returns plain int/float for numpy scalars, ISO-8601 strings for pandas
    timestamps, WKT text for shapely geometries, None for missing scalars,
    and the value unchanged otherwise.
    """
    # np.integer / np.floating cover every width (int8..int64, float16..float64),
    # not only the 32/64-bit variants the original checked.
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, np.floating):
        return float(value)
    if isinstance(value, pd.Timestamp):
        return value.isoformat()
    # Containers are returned as-is: pd.isna() on a list/array yields an array,
    # and `if pd.isna(value)` would raise "truth value is ambiguous" below.
    if isinstance(value, (list, tuple, dict, set, np.ndarray)):
        return value
    if isinstance(value, shapely_base.BaseGeometry):
        return str(value)  # WKT string representation
    if pd.isna(value):
        return None
    return value
|
||||
|
||||
|
||||
def detect_zip_type(zip_path: str) -> str:
    """Classify a ZIP archive as 'gdb', 'shp', or 'unknown' from its entry names."""
    with zipfile.ZipFile(zip_path, "r") as archive:
        entries = [name.lower() for name in archive.namelist()]

    # An ESRI File Geodatabase shows up either as a *.gdb/ directory prefix...
    if any(".gdb/" in name for name in entries):
        return "gdb"
    # ...or via its characteristic internal table/index file extensions.
    gdb_suffixes = (".gdbtable", ".gdbtablx", ".gdbindexes", ".spx")
    if any(name.endswith(gdb_suffixes) for name in entries):
        return "gdb"
    # A shapefile archive must contain at least one .shp member.
    if any(name.endswith(".shp") for name in entries):
        return "shp"
    return "unknown"
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@app.post("/upload")
async def upload_file(file: UploadFile = File(...)):
    """Accept a CSV/PDF/ZIP upload, detect/attach geometry, and return an
    analysis preview (row counts, geometry validity stats, warnings)."""
    fname = file.filename
    ext = os.path.splitext(fname)[1].lower()
    contents = await file.read()
    size_mb = len(contents) / (1024*1024)
    if size_mb > MAX_FILE_MB:
        raise HTTPException(status_code=413, detail="File too large")
    # Persist the upload to disk so the file-based readers can work from a path.
    tmp_path = UPLOAD_FOLDER / fname
    with open(tmp_path, "wb") as f:
        f.write(contents)

    try:
        df = None

        print('ext', ext)

        if ext == ".csv":
            df = read_csv(str(tmp_path))
        elif ext == ".pdf":
            tbl = read_pdf(tmp_path)
            if len(tbl) > 1:
                # Multiple tables found: return them all and let the client pick
                # one (the client then posts its choice to /process-pdf).
                response = {
                    "message": "File berhasil dibaca dan dianalisis.",
                    "tables": tbl,
                    "file_type": ext
                }
                return JSONResponse(content=response)
            else:
                df = convert_df(tbl[0])
        elif ext == ".zip":
            zip_type = detect_zip_type(str(tmp_path))

            if zip_type == "shp":
                print("[INFO] ZIP terdeteksi sebagai Shapefile.")
                df = read_shp(str(tmp_path))

            elif zip_type == "gdb":
                print("[INFO] ZIP terdeteksi sebagai Geodatabase (GDB).")
                df = read_gdb(str(tmp_path))

            else:
                raise HTTPException(
                    status_code=400,
                    detail="ZIP file tidak mengandung SHP atau GDB yang valid."
                )
        else:
            raise HTTPException(status_code=400, detail="Unsupported file type")

        if df is None or (hasattr(df, "empty") and df.empty):
            return JSONResponse({"error": "No valid table detected"}, status_code=400)

        # Try to build geometry from the data itself (lat/lon, WKT, path columns).
        result = detect_and_build_geometry(df, master_polygons=None)

        # No usable geometry in the data: fall back to matching administrative
        # region names against the reference polygon database.
        if not hasattr(result, "geometry") or result.geometry.isna().all():
            result = attach_polygon_geometry_auto(result)

        if isinstance(result, gpd.GeoDataFrame) and "geometry" in result.columns:
            geom_type = ", ".join([g for g in result.geometry.geom_type.unique() if g]) \
                if not result.empty else "None"

            null_geom = result.geometry.isna().sum()
            print(f"[INFO] Tipe Geometry: {geom_type}")
            print(f"[INFO] Jumlah geometry kosong: {null_geom}")
        else:
            # Geometry could not be attached at all: report an empty analysis.
            response = {
                "message": "Tidak menemukan tabel yang relevan.",
                "file_type": ext,
                "rows": 0,
                "columns": 0,
                "geometry_valid": 0,
                "geometry_empty": 0,
                "geometry_valid_percent": 0,
                "warnings": [],
                "warning_examples": [],
                "preview": []
            }

            return JSONResponse(content=response)

        # NOTE(review): the temp file is only removed on this success path;
        # early returns and exceptions above leave it behind — confirm intended.
        tmp_path.unlink(missing_ok=True)

        # Normalize missing/infinite values before JSON serialization.
        result = result.replace([pd.NA, float('inf'), float('-inf')], None)

        if isinstance(result, gpd.GeoDataFrame) and 'geometry' in result.columns:
            # Geometry objects are not JSON-serializable; convert to WKT text.
            result['geometry'] = result['geometry'].apply(
                lambda g: g.wkt if g is not None else None
            )

        empty_count = result['geometry'].apply(is_geom_empty).sum()
        valid_count = len(result) - empty_count
        match_percentage = (valid_count / len(result)) * 100

        warnings = []
        if empty_count > 0:
            warnings.append(
                f"{empty_count} dari {len(result)} baris tidak memiliki geometry yang valid "
                f"({100 - match_percentage:.2f}% data gagal cocok)."
            )

        if empty_count > 0:
            # Cap unmatched-row examples at 500 to keep the response bounded.
            examples = result[result['geometry'].apply(is_geom_empty)].head(500)
            warning_examples = examples.to_dict(orient="records")
        else:
            warning_examples = []

        preview_data = result.to_dict(orient="records")

        # Convert numpy/pandas scalars to plain JSON-safe Python values.
        preview_safe = [
            {k: safe_json(v) for k, v in row.items()} for row in preview_data
        ]

        warning_safe = [
            {k: safe_json(v) for k, v in row.items()} for row in warning_examples
        ]

        response = {
            "message": "File berhasil dibaca dan dianalisis.",
            "rows": int(len(result)),
            "columns": list(map(str, result.columns)),
            "geometry_valid": int(valid_count),
            "geometry_empty": int(empty_count),
            "geometry_valid_percent": float(round(match_percentage, 2)),
            "warnings": warnings,
            "warning_examples": warning_safe,
            "preview": preview_safe
        }

        return JSONResponse(content=response)

    except Exception as e:
        print(f"[ERROR] {e}")
        return JSONResponse({"error": str(e)}, status_code=500)

    # finally:
    #     db_session.close()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class PdfRequest(BaseModel):
    """Payload for /process-pdf: one table the client selected from a PDF."""
    title: str          # table title/caption
    columns: List[str]  # column headers
    rows: List[List]    # cell values, one inner list per row
|
||||
|
||||
@app.post("/process-pdf")
async def upload_file(payload: PdfRequest):
    """Analyze one client-selected PDF table: attach geometry, return a preview.

    NOTE(review): this redefines the module-level name `upload_file` used by
    the /upload handler above; FastAPI keeps both routes registered, but the
    duplicate name is confusing — consider renaming.
    """
    try:
        df = convert_df(payload.model_dump())
        if df is None or (hasattr(df, "empty") and df.empty):
            return JSONResponse({"error": "No valid table detected"}, status_code=400)

        # Try to build geometry from the table contents (lat/lon, WKT, ...).
        result = detect_and_build_geometry(df, master_polygons=None)

        if not hasattr(result, "geometry") or result.geometry.isna().all():
            print("[INFO] Mencoba menambahkan geometry (MultiPolygon) berdasarkan nama wilayah...")
            result = attach_polygon_geometry_auto(result)

        print("\n" + "="*80)

        if isinstance(result, gpd.GeoDataFrame) and "geometry" in result.columns:
            geom_type = ", ".join([g for g in result.geometry.geom_type.unique() if g]) \
                if not result.empty else "None"

            null_geom = result.geometry.isna().sum()
            print(f"[INFO] Tipe Geometry: {geom_type}")
            print(f"[INFO] Jumlah geometry kosong: {null_geom}")
        else:
            # Geometry could not be attached: report an empty analysis.
            print("[WARN] Object bukan GeoDataFrame atau tidak punya kolom geometry.")
            print(f"[DEBUG] Kolom saat ini: {list(result.columns)}")
            response = {
                "message": "Tidak menemukan tabel yang relevan.",
                "file_type": ".pdf",
                "rows": 0,
                "columns": 0,
                "geometry_valid": 0,
                "geometry_empty": 0,
                "geometry_valid_percent": 0,
                "warnings": [],
                "warning_examples": [],
                "preview": []
            }

            return JSONResponse(content=response)

        # Normalize missing/infinite values before JSON serialization.
        result = result.replace([pd.NA, float('inf'), float('-inf')], None)
        if isinstance(result, gpd.GeoDataFrame) and 'geometry' in result.columns:
            # Geometry objects are not JSON-serializable; convert to WKT text.
            result['geometry'] = result['geometry'].apply(
                lambda g: g.wkt if g is not None else None
            )

        empty_count = result['geometry'].apply(is_geom_empty).sum()
        valid_count = len(result) - empty_count
        match_percentage = (valid_count / len(result)) * 100

        warnings = []
        if empty_count > 0:
            warnings.append(
                f"{empty_count} dari {len(result)} baris tidak memiliki geometry yang valid "
                f"({100 - match_percentage:.2f}% data gagal cocok)."
            )

        if empty_count > 0:
            # Cap unmatched-row examples at 500 to keep the response bounded.
            examples = result[result['geometry'].apply(is_geom_empty)].head(500)
            warning_examples = examples.to_dict(orient="records")
        else:
            warning_examples = []

        # preview_data = result.head(5).to_dict(orient="records")
        preview_data = result.to_dict(orient="records")

        # Convert numpy/pandas scalars to plain JSON-safe Python values.
        preview_safe = [
            {k: safe_json(v) for k, v in row.items()} for row in preview_data
        ]

        warning_safe = [
            {k: safe_json(v) for k, v in row.items()} for row in warning_examples
        ]

        response = {
            "message": "File berhasil dibaca dan dianalisis.",
            "rows": int(len(result)),
            "columns": list(map(str, result.columns)),
            "geometry_valid": int(valid_count),
            "geometry_empty": int(empty_count),
            "geometry_valid_percent": float(round(match_percentage, 2)),
            "warnings": warnings,
            "warning_examples": warning_safe,
            "preview": preview_safe
        }

        return JSONResponse(content=response)

    except Exception as e:
        print(f"[ERROR] {e}")

        return JSONResponse({"error": str(e)}, status_code=500)

    # finally:
    #     db_session.close()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# WKT geometry-type prefixes accepted in the user-supplied geometry column
# (checked via str.startswith on the upper-cased value). The Z/M/ZM variants
# are listed explicitly for documentation, although startswith already matches
# them through the base names.
VALID_WKT_PREFIXES = (
    "POINT",
    "POINT Z",
    "POINT M",
    "POINT ZM",
    "MULTIPOINT",
    "MULTIPOINT Z",
    "MULTIPOINT M",
    "MULTIPOINT ZM",
    "LINESTRING",
    "LINESTRING Z",
    "LINESTRING M",
    "LINESTRING ZM",
    "MULTILINESTRING",
    "MULTILINESTRING Z",
    "MULTILINESTRING M",
    "MULTILINESTRING ZM",
    "POLYGON",
    "POLYGON Z",
    "POLYGON M",
    "POLYGON ZM",
    "MULTIPOLYGON",
    "MULTIPOLYGON Z",
    "MULTIPOLYGON M",
    "MULTIPOLYGON ZM",
    "GEOMETRYCOLLECTION",
    "GEOMETRYCOLLECTION Z",
    "GEOMETRYCOLLECTION M",
    "GEOMETRYCOLLECTION ZM",
)
|
||||
|
||||
|
||||
class UploadRequest(BaseModel):
    """Payload for /upload_to_postgis: a previewed table the user confirmed."""
    title: str          # used (lowercased, underscored) as the target table name
    rows: List[dict]    # records, each possibly carrying a WKT 'geometry' value
    columns: List[str]  # column order as shown in the preview
|
||||
|
||||
@app.post("/upload_to_postgis")
def upload_to_postgis(payload: UploadRequest):
    """Persist a confirmed preview table into PostGIS.

    Parses the 'geometry' column from WKT, writes the GeoDataFrame to a table
    named after the payload title, and adds a serial primary key column.

    Raises:
        HTTPException 400: payload has no geometry column or an unusable title.
        HTTPException 500: any database/parsing failure.
    """
    import re  # local import: sanitizer only needed here

    try:
        # Sanitize the client-supplied title into a safe SQL identifier:
        # lowercase, spaces -> underscores, then strip everything outside
        # [a-z0-9_]. This prevents SQL injection through the interpolated
        # DROP/ALTER statements below (the title comes from the frontend).
        table_name = re.sub(r"[^a-z0-9_]", "", payload.title.lower().replace(" ", "_"))
        if not table_name:
            raise HTTPException(status_code=400, detail="Judul tidak menghasilkan nama tabel yang valid.")

        df = pd.DataFrame(payload.rows)
        print(f"[INFO] Diterima {len(df)} baris data dari frontend.")

        if "geometry" in df.columns:
            # Only parse values that look like WKT; everything else becomes None.
            df["geometry"] = df["geometry"].apply(
                lambda g: wkt.loads(g) if isinstance(g, str) and g.strip().upper().startswith(VALID_WKT_PREFIXES) else None
            )
            gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")
        else:
            raise HTTPException(status_code=400, detail="Kolom geometry tidak ditemukan dalam data.")

        # Drop any stale table first; to_postgis(if_exists="replace") would also
        # do this, but an explicit drop clears incompatible leftover schemas.
        with engine.begin() as conn:
            conn.execute(text(f'DROP TABLE IF EXISTS "{table_name}"'))

        gdf.to_postgis(table_name, engine, if_exists="replace", index=False)

        # Add a surrogate primary key for clients that need row identity.
        with engine.begin() as conn:
            conn.execute(text(f'ALTER TABLE "{table_name}" ADD COLUMN _id SERIAL PRIMARY KEY;'))

        print(f"[INFO] Tabel '{table_name}' berhasil dibuat di PostGIS ({len(gdf)} baris).")

        return {
            "table_name": table_name,
            "status": "success",
            "message": f"Tabel '{table_name}' berhasil diunggah ke PostGIS.",
            "total_rows": len(gdf),
            "geometry_type": list(gdf.geom_type.unique())
        }

    except HTTPException:
        # Preserve deliberate 4xx responses instead of collapsing them to 500.
        raise
    except Exception as e:
        print(f"[ERROR] Gagal upload ke PostGIS: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
|
||||
25
requirements.txt
Normal file
25
requirements.txt
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
fastapi
uvicorn[standard]
pandas
numpy
geopandas
shapely
fiona
pyproj
SQLAlchemy
# sqlalchemy  # duplicate of SQLAlchemy above (pip treats package names case-insensitively)
psycopg2-binary
rapidfuzz
pdfplumber
# zipfile36  # unnecessary: zipfile is in the Python 3 standard library
python-dotenv
pydantic
python-multipart
aiofiles
starlette
openpyxl
requests
# pathlib  # unnecessary and harmful: the PyPI "pathlib" backport shadows the stdlib module on Python 3

# --- jika menggunakan ai ---
groq
|
||||
376
services/geometry_detector.py
Normal file
376
services/geometry_detector.py
Normal file
|
|
@ -0,0 +1,376 @@
|
|||
import geopandas as gpd
from shapely.geometry import Point, LineString
import pandas as pd
import re
from shapely import wkt
from rapidfuzz import process, fuzz
from sqlalchemy import create_engine
from shapely.geometry.base import BaseGeometry
from core.config import REFERENCE_DB_URL, REFERENCE_SCHEMA, REF_COLUMN_MAP

# ============================================================
# CONFIGURATION AND CONSTANTS
# ============================================================

# Known header spellings for each administrative level, used to locate the
# desa/kecamatan/kabupaten columns in arbitrary user tables.
COLUMN_ALIASES = {
    'desa': ['desa', 'kelurahan', 'desa_kelurahan', 'desa/kelurahan', 'nama_desa', 'nama_kelurahan', 'Desa/Kel'],
    'kecamatan': ['kec', 'kecamatan', 'nama_kec', 'nama_kecamatan'],
    'kabupaten': ['kab', 'kabupaten', 'kota', 'kabupaten_kota', 'kota_kabupaten', 'kab/kota', 'kota/kabupaten', 'kota/kab']
}
|
||||
|
||||
# ============================================================
|
||||
# FUNGSI BANTU ADMINISTRATIF
|
||||
# ============================================================
|
||||
|
||||
def find_admin_column(df, aliases):
    """Find the best-matching column for each admin level (desa/kec/kab).

    Args:
        df: DataFrame whose column headers are searched.
        aliases: mapping of level name -> list of known header spellings.

    Returns:
        dict mapping each detected level to the actual column name in *df*.
    """
    def _norm(s):
        # Shared normalization so aliases and headers compare the same way.
        return s.strip().lower().replace(' ', '_').replace('/', '_')

    matched = {}
    for level, alias_list in aliases.items():
        # Normalize aliases too: entries such as 'Desa/Kel' or 'kab/kota'
        # could otherwise never match the normalized column names.
        norm_aliases = [_norm(alias) for alias in alias_list]
        for col in df.columns:
            col_norm = _norm(col)
            if any(alias in col_norm for alias in norm_aliases):
                matched[level] = col
                break
    return matched
|
||||
|
||||
|
||||
def detect_smallest_admin_level(df):
    """Return the smallest administrative level present in *df*'s columns
    ('desa', 'kecamatan', or 'kabupaten'), or None when none is recognized."""
    headers = [name.lower() for name in df.columns]

    def has_any(*keywords):
        # True when some column header contains one of the keywords.
        return any(kw in header for header in headers for kw in keywords)

    if has_any('desa', 'kelurahan'):
        return 'desa'
    if has_any('kecamatan'):
        return 'kecamatan'
    if has_any('kab', 'kota'):
        return 'kabupaten'
    return None
|
||||
|
||||
|
||||
def fuzzy_merge(df, master, left_key, right_key, threshold=85):
    """Fuzzy-match region names in df[left_key] against master[right_key].

    Adds a 'match' column to *df* (NOTE: mutates the caller's frame) holding
    the best master name scoring >= threshold, then left-joins master on it.
    """
    matches = df[left_key].apply(
        lambda x: process.extractOne(str(x), master[right_key], score_cutoff=threshold)
    )
    # extractOne returns a (choice, score, index) tuple, or None below the cutoff.
    df['match'] = matches.apply(lambda m: m[0] if m else None)
    merged = df.merge(master, left_on='match', right_on=right_key, how='left')
    return merged
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def normalize_name(name: str, level: str = None):
    """Normalize an Indonesian region name for cross-dataset matching.

    Strips administrative prefixes (desa/kecamatan/kabupaten/...), removes
    non-letter characters, merges/deduplicates near-identical adjacent tokens
    (artifacts of messy sources), and title-cases the result. For the
    kabupaten/kota level, the 'Kota ' prefix is re-applied only when the
    original spelling contained 'kota'. Returns None for non-string or blank
    input.
    """
    if not isinstance(name, str):
        return None

    name = name.strip()
    if not name:
        return None

    raw = name.lower()
    # Drop level prefixes such as "desa ", "kecamatan ", "kab. ", etc.
    raw = re.sub(r'^(desa|kelurahan|kel|dusun|kampung)\s+', '', raw)
    raw = re.sub(r'^(kecamatan|kec)\s+', '', raw)
    raw = re.sub(r'^(kabupaten|kab\.?|kab)\s+', '', raw)

    if level in ["kabupaten", "kota"]:
        raw = re.sub(r'^(kota\s+)', '', raw)

    # Keep letters and spaces only, then collapse runs of whitespace.
    raw = re.sub(r'[^a-z\s]', '', raw)
    raw = re.sub(r'\s+', ' ', raw).strip()

    tokens = raw.split()

    # Merge adjacent tokens that are near-identical (>75% similar),
    # e.g. split artifacts from OCR/extraction.
    merged_tokens = []
    i = 0
    while i < len(tokens):
        if i < len(tokens) - 1:
            sim = fuzz.ratio(tokens[i], tokens[i + 1])
            if sim > 75:
                merged_tokens.append(tokens[i] + tokens[i + 1])
                i += 2
                continue
        merged_tokens.append(tokens[i])
        i += 1

    # Drop consecutive near-duplicate tokens (>95% similar) that survived.
    cleaned_tokens = []
    prev = None
    for tok in merged_tokens:
        if prev and fuzz.ratio(prev, tok) > 95:
            continue
        cleaned_tokens.append(tok)
        prev = tok

    raw = " ".join(cleaned_tokens)
    formatted = raw.title()

    # Re-apply or remove the "Kota " prefix based on the original spelling.
    if level in ["kabupaten", "kota"]:
        if "kota" in name.lower():
            if not formatted.startswith("Kota "):
                formatted = f"Kota {formatted}"
        else:
            formatted = formatted.replace("Kota ", "")

    return formatted
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def is_geom_empty(g):
    """Return True for missing geometry values (None/NaN) or empty shapes."""
    if g is None:
        return True
    if isinstance(g, float):
        # A float in a geometry slot is pandas' NaN placeholder for "missing".
        return bool(pd.isna(g))
    if isinstance(g, BaseGeometry):
        return g.is_empty
    return False
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# ============================================================
|
||||
# FUNGSI UTAMA GEOMETRY DETECTION (LAT/LON / PATH)
|
||||
# ============================================================
|
||||
def detect_and_build_geometry(df: pd.DataFrame, master_polygons: gpd.GeoDataFrame = None):
    """
    Detect and build a geometry column for *df*.

    Sources tried in order: an existing GeoDataFrame geometry, lat/lon columns,
    a coordinate/WKT-like column, and finally a join against *master_polygons*
    (when provided). Returns a GeoDataFrame when geometry was built, otherwise
    the original DataFrame unchanged.
    """
    from ast import literal_eval  # safe parsing of coordinate-list literals

    # 1) Already a GeoDataFrame with usable geometry: keep it as-is.
    if isinstance(df, gpd.GeoDataFrame):
        if "geometry" in df.columns and df.geometry.notna().any():
            geom_count = df.geometry.notna().sum()
            geom_type = list(df.geom_type.unique())
            print(f"[INFO] Detected existing geometry in GeoDataFrame ({geom_count} features, {geom_type}).")
            return df

    # 2) Point geometry from latitude/longitude columns.
    lat_col = next(
        (c for c in df.columns if re.search(r'\b(lat|latitude|y[_\s]*coord|y$)\b', c.lower())), None
    )
    lon_col = next(
        (c for c in df.columns if re.search(r'\b(lon|long|longitude|x[_\s]*coord|x$)\b', c.lower())), None
    )

    if lat_col and lon_col:
        # Coerce to numeric so malformed cells become NaN instead of crashing.
        df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce')
        df[lon_col] = pd.to_numeric(df[lon_col], errors='coerce')
        gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df[lon_col], df[lat_col]), crs="EPSG:4326")
        print("[INFO] Geometry dibangun dari kolom lat/lon.")
        return gdf

    # 3) A column that looks like it stores coordinates / WKT / paths.
    coord_col = next(
        (c for c in df.columns if re.search(r'(geom|geometry|wkt|shp|shape|path|coord)', c.lower())), None
    )

    if coord_col and df[coord_col].notnull().any():
        sample_val = str(df[coord_col].dropna().iloc[0]).strip()

        if sample_val.startswith('['):
            # Values like "[(x1, y1), (x2, y2), ...]" become LineStrings.
            def parse_geom(val):
                try:
                    # literal_eval instead of eval: the value comes from an
                    # uploaded file and must not execute arbitrary code.
                    pts = literal_eval(val)
                    return LineString(pts)
                except Exception:
                    return None
            df['geometry'] = df[coord_col].apply(parse_geom)
            gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")
            print("[INFO] Geometry dibangun dari kolom koordinat/path (list of points).")
            return gdf

        elif any(x in sample_val.upper() for x in ["POINT", "LINESTRING", "POLYGON"]):
            try:
                df['geometry'] = df[coord_col].apply(
                    lambda g: wkt.loads(g) if isinstance(g, str) and any(
                        x in g.upper() for x in ["POINT", "LINESTRING", "POLYGON"]
                    ) else None
                )
                gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")
                print("[INFO] Geometry dibangun dari kolom WKT (Point/Line/Polygon/MultiPolygon).")
                return gdf
            except Exception as e:
                print(f"[WARN] Gagal parsing kolom geometry sebagai WKT: {e}")

    # 4) Join against caller-provided master polygons on admin-name columns.
    if master_polygons is not None:
        df.columns = df.columns.str.lower().str.strip().str.replace(' ', '_').str.replace('/', '_')
        matches = find_admin_column(df, COLUMN_ALIASES)

        if 'desa' in matches:
            admin_col = matches['desa']
            merged = df.merge(master_polygons, left_on=admin_col, right_on='nama_desa', how='left')
            # Retry unmatched names with fuzzy matching.
            if merged['geometry'].isna().sum() > 0:
                merged = fuzzy_merge(df, master_polygons, admin_col, 'nama_desa')
            gdf = gpd.GeoDataFrame(merged, geometry='geometry', crs=master_polygons.crs)
            return gdf

        elif 'kecamatan' in matches:
            admin_col = matches['kecamatan']
            merged = df.merge(master_polygons, left_on=admin_col, right_on='nama_kecamatan', how='left')
            gdf = gpd.GeoDataFrame(merged, geometry='geometry', crs=master_polygons.crs)
            return gdf

        elif 'kabupaten' in matches:
            admin_col = matches['kabupaten']
            merged = df.merge(master_polygons, left_on=admin_col, right_on='nama_kabupaten', how='left')
            gdf = gpd.GeoDataFrame(merged, geometry='geometry', crs=master_polygons.crs)
            return gdf

    print("[WARN] Tidak ditemukan geometry (lat/lon, path, atau master).")
    return df
|
||||
|
||||
|
||||
def get_reference_polygons(level):
    """Load administrative boundary polygons (MultiPolygon) for *level*
    ('desa' | 'kecamatan' | 'kabupaten') from the reference database.

    Raises:
        ValueError: when *level* has no reference table.
    """
    table_map = {
        'desa': f"{REFERENCE_SCHEMA}.administrasi_ar_keldesa_jatim",
        'kecamatan': f"{REFERENCE_SCHEMA}.administrasi_ar_kec_jatim",
        'kabupaten': f"{REFERENCE_SCHEMA}.administrasi_ar_kabkot_jatim"
    }

    table_name = table_map.get(level)
    if not table_name:
        raise ValueError(f"Tidak ada tabel referensi untuk level '{level}'.")

    # Reuse a single engine (and its connection pool) across calls instead of
    # constructing a new engine — and a new pool — on every request.
    engine = getattr(get_reference_polygons, "_engine", None)
    if engine is None:
        engine = create_engine(REFERENCE_DB_URL)
        get_reference_polygons._engine = engine

    # ST_Multi promotes plain Polygons so every row is a MultiPolygon.
    query = f"SELECT *, ST_Multi(geom) AS geometry FROM {table_name}"
    gdf = gpd.read_postgis(query, engine, geom_col='geometry')

    print(f"[INFO] {len(gdf)} data referensi '{level}' berhasil dimuat dari {table_name}.")
    return gdf
|
||||
|
||||
|
||||
# ============================================================
|
||||
# FUNGSI: AUTO ATTACH POLYGON KE DATAFRAME NON-SPASIAL
|
||||
# ============================================================
|
||||
def attach_polygon_geometry_auto(df: pd.DataFrame):
    """
    Attach a MultiPolygon geometry column based on the combination of
    (desa/kelurahan + kecamatan + kabupaten/kota) names, without duplicating
    rows. Falls back to fuzzy matching for rows the exact join misses.
    Returns the original *df* unchanged when no usable admin columns exist.
    """
    level = detect_smallest_admin_level(df)
    if not level:
        print("[WARN] Tidak ditemukan kolom administratif (desa/kecamatan/kabupaten).")
        return df

    print(f"[INFO] Detected smallest admin level: {level}")
    ref_gdf = get_reference_polygons(level)

    # Locate the admin-name columns in the user table.
    desa_col = next((c for c in df.columns if any(x in c.lower() for x in ['desa', 'kelurahan'])), None)
    kec_col = next((c for c in df.columns if 'kec' in c.lower()), None)
    kab_col = next((c for c in df.columns if any(x in c.lower() for x in ['kab', 'kota'])), None)

    # Validate the combination: a desa needs kecamatan+kabupaten context, a
    # kecamatan needs kabupaten context; kabupaten alone is acceptable.
    if desa_col and (not kec_col or not kab_col):
        print("[ERROR] Kolom 'Desa' ditemukan tetapi kolom 'Kecamatan' dan/atau 'Kabupaten' tidak lengkap.")
        print(f"[DEBUG] Ditemukan: Desa={desa_col}, Kec={kec_col}, Kab={kab_col}")
        return df

    elif not desa_col and kec_col and not kab_col:
        print("[ERROR] Kolom 'Kecamatan' ditemukan tetapi kolom 'Kabupaten/Kota' tidak ditemukan.")
        print(f"[DEBUG] Ditemukan: Desa={desa_col}, Kec={kec_col}, Kab={kab_col}")
        return df

    elif kab_col and not desa_col and not kec_col :
        print("[INFO] Struktur kolom administratif valid (minimal Kabupaten/Kota ditemukan).")
        print(f"[DEBUG] Ditemukan: Desa={desa_col}, Kec={kec_col}, Kab={kab_col}")

    elif not desa_col and not kec_col and not kab_col:
        print("[WARN] Tidak ditemukan kolom administratif apapun (Desa/Kecamatan/Kabupaten).")
        print(f"[DEBUG] Kolom CSV: {list(df.columns)}")
        return df

    # Column names used in the reference tables.
    desa_ref = "WADMKD"
    kec_ref = "WADMKC"
    kab_ref = "WADMKK"

    # Normalize names on both sides so the composite join keys compare equal.
    if desa_col is not None:
        df[desa_col] = df[desa_col].astype(str).apply(lambda x: normalize_name(x, "desa"))

    if kec_col is not None:
        df[kec_col] = df[kec_col].astype(str).apply(lambda x: normalize_name(x, "kecamatan"))

    if kab_col is not None:
        df[kab_col] = df[kab_col].astype(str).apply(lambda x: normalize_name(x, "kabupaten"))

    if desa_ref is not None:
        ref_gdf[desa_ref] = ref_gdf[desa_ref].astype(str).apply(lambda x: normalize_name(x, "desa"))

    if kec_ref is not None:
        ref_gdf[kec_ref] = ref_gdf[kec_ref].astype(str).apply(lambda x: normalize_name(x, "kecamatan"))

    if kab_ref is not None:
        ref_gdf[kab_ref] = ref_gdf[kab_ref].astype(str).apply(lambda x: normalize_name(x, "kabupaten"))

    join_cols = [col for col in [desa_col, kec_col, kab_col] if col]

    if not join_cols:
        print("[ERROR] Tidak ada kolom administratif yang bisa digunakan untuk join key.")
    else:
        join_cols_df = [col for col in [desa_col, kec_col, kab_col] if col]
        join_cols_ref = [col for col in [desa_ref, kec_ref, kab_ref] if col]

        # Use the same number of levels on both sides, keeping the coarsest
        # (rightmost) columns so the key components align level-for-level.
        common_depth = min(len(join_cols_df), len(join_cols_ref))
        join_cols_df = join_cols_df[-common_depth:]
        join_cols_ref = join_cols_ref[-common_depth:]

        # print(f"[DEBUG] Join kolom DF : {join_cols_df}")
        # print(f"[DEBUG] Join kolom REF : {join_cols_ref}")

        # Composite "a|b|c" key per row on both sides.
        df["_join_key"] = df[join_cols_df].astype(str).agg("|".join, axis=1)
        ref_gdf["_join_key"] = ref_gdf[join_cols_ref].astype(str).agg("|".join, axis=1)

        # print(f"[INFO] Join key berhasil dibuat dari kolom: {join_cols_df}")

        # Exact match first (deduplicated so the merge never multiplies rows).
        ref_lookup = ref_gdf[["_join_key", "geometry"]].drop_duplicates(subset=["_join_key"])
        df = df.merge(ref_lookup, how="left", on="_join_key")
        matched = df["geometry"].notna().sum()
        # print(f"[INFO] {matched} dari {len(df)} baris cocok langsung berdasarkan (desa + kec + kab/kota).")

        if matched < len(df):
            unmatched = df[df["geometry"].isna()]
            # print(f"[INFO] Melakukan fuzzy match untuk {len(unmatched)} baris yang belum cocok...")

            ref_dict = dict(zip(ref_lookup["_join_key"], ref_lookup["geometry"]))

            def find_fuzzy_geom(row):
                # Best fuzzy candidate geometry for one unmatched composite key.
                key = row["_join_key"]
                if not isinstance(key, str):
                    return None
                # fuzzy old
                # match = process.extractOne(key, list(ref_dict.keys()), scorer=fuzz.token_sort_ratio)
                # fuzzy new
                match = process.extractOne(
                    key, list(ref_dict.keys()), scorer=fuzz.token_set_ratio, score_cutoff=80
                )

                # NOTE(review): score_cutoff=80 but acceptance requires >= 85,
                # so the 80-85 band is discarded — confirm which bound is intended.
                if match and match[1] >= 85:
                    return ref_dict[match[0]]
                return None

            df.loc[df["geometry"].isna(), "geometry"] = df[df["geometry"].isna()].apply(find_fuzzy_geom, axis=1)

        df = df.drop(columns=["_join_key"], errors="ignore")

        # admin_cols = [col for col in [desa_col, kec_col, kab_col] if col and col in df.columns]
        # if matched < len(df):
        #     diff = df[df['geometry'].isna()][admin_cols]

        #     print("[DEBUG] Baris yang tidak match:")
        #     if diff.empty:
        #         print("(semua baris berhasil match)")
        #     else:
        #         print(diff.to_string(index=False))

        # print(f"[REPORT] Total match: {df['geometry'].notna().sum()} / {len(df)} ({df['geometry'].notna().mean()*100:.2f}%)")

    return gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")
|
||||
59
services/reader_csv.py
Normal file
59
services/reader_csv.py
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
# import pandas as pd
|
||||
|
||||
# def read_csv(path: str):
|
||||
# df = pd.read_csv(path)
|
||||
# df.columns = [c.strip() for c in df.columns]
|
||||
|
||||
# return df
|
||||
|
||||
|
||||
# services/reader_csv.py
|
||||
import pandas as pd
|
||||
import re
|
||||
|
||||
def detect_header_line(path, max_rows=10):
    """
    Detect which line of a CSV file holds the column names.

    Scores the first ``max_rows`` lines by (alphabetic-cell ratio minus
    numeric-cell ratio); the highest-scoring line is assumed to be the header.

    Returns the 0-based index of the detected header line.
    """
    # BUGFIX: the original used `[next(f) for _ in range(max_rows)]`, which
    # raised StopIteration on files with fewer than max_rows lines.
    lines = []
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        for _ in range(max_rows):
            line = f.readline()
            if not line:
                break
            lines.append(line)

    header_line_idx = 0
    best_score = -1

    for i, line in enumerate(lines):
        # Split on comma / semicolon / pipe / tab so all common dialects work.
        cells = re.split(r'[;,|\t]', line.strip())
        # Heuristic: many alphabetic cells and few numeric cells -> header.
        alpha_ratio = sum(bool(re.search(r'[A-Za-z]', c)) for c in cells) / max(len(cells), 1)
        digit_ratio = sum(bool(re.search(r'\d', c)) for c in cells) / max(len(cells), 1)
        score = alpha_ratio - digit_ratio  # higher means more header-like

        if score > best_score:
            best_score = score
            header_line_idx = i

    return header_line_idx
|
||||
|
||||
|
||||
def read_csv(path: str):
    """
    Load a CSV file, auto-detecting which line contains the column names.

    Falls back to treating the first line as the header when detection fails.
    Unnamed/empty columns are dropped, column names are stripped, and rows
    that are entirely blank are removed.
    """
    try:
        header_idx = detect_header_line(path)
        print(f"[INFO] Detected header line: {header_idx + 1}")
        frame = pd.read_csv(path, header=header_idx, encoding='utf-8', low_memory=False)
    except Exception as e:
        print(f"[WARN] Gagal deteksi header otomatis: {e}, fallback ke baris pertama")
        frame = pd.read_csv(path, encoding='utf-8', low_memory=False)

    # Drop auto-generated "Unnamed: N" columns, then normalize the names.
    frame = frame.loc[:, ~frame.columns.str.contains('^Unnamed')]
    frame.columns = [str(name).strip() for name in frame.columns]

    # Discard rows where every cell is missing.
    return frame.dropna(how='all')
|
||||
75
services/reader_gdb.py
Normal file
75
services/reader_gdb.py
Normal file
|
|
@ -0,0 +1,75 @@
|
|||
import geopandas as gpd
|
||||
import fiona
|
||||
import zipfile
|
||||
import tempfile
|
||||
import os
|
||||
import shutil
|
||||
|
||||
def read_gdb(zip_path: str, layer: str = None):
    """
    Read one layer from an ESRI File Geodatabase shipped inside a ZIP.

    Parameters
    ----------
    zip_path : str
        Path to a ``.zip`` that contains either a ``*.gdb`` folder or loose
        ``*.gdbtable`` files (a nested structure is rebuilt on the fly).
    layer : str, optional
        Layer name to read; defaults to the first layer found.

    Returns
    -------
    geopandas.GeoDataFrame (CRS assumed EPSG:4326 when undeclared).

    Raises
    ------
    ValueError
        When the input is not a ZIP, contains no GDB content, or the layer
        cannot be read.
    """
    if not zip_path.lower().endswith(".zip"):
        raise ValueError("File GDB harus berupa ZIP yang berisi folder .gdb atau file .gdbtable")

    tmpdir = tempfile.mkdtemp()
    try:
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(tmpdir)

        # macOS archives ship a junk __MACOSX folder that confuses the scan.
        macosx_path = os.path.join(tmpdir, "__MACOSX")
        if os.path.exists(macosx_path):
            shutil.rmtree(macosx_path)

        gdb_folders = []
        for root, dirs, _ in os.walk(tmpdir):
            for d in dirs:
                if d.lower().endswith(".gdb"):
                    gdb_folders.append(os.path.join(root, d))

        if not gdb_folders:
            # No *.gdb folder: look for loose .gdbtable files and rebuild one.
            gdbtable_files = []
            for root, _, files in os.walk(tmpdir):
                for f in files:
                    if f.lower().endswith(".gdbtable"):
                        gdbtable_files.append(os.path.join(root, f))

            if not gdbtable_files:
                raise ValueError("Tidak ditemukan folder .gdb atau file .gdbtable di dalam ZIP")

            first_folder = os.path.dirname(gdbtable_files[0])
            base_name = os.path.basename(first_folder)
            gdb_folder_path = os.path.join(tmpdir, f"{base_name}.gdb")

            os.makedirs(gdb_folder_path, exist_ok=True)

            for fpath in os.listdir(first_folder):
                if ".gdb" in fpath.lower():
                    shutil.move(os.path.join(first_folder, fpath), os.path.join(gdb_folder_path, fpath))

            gdb_folders.append(gdb_folder_path)

        gdb_path = gdb_folders[0]

        layers = fiona.listlayers(gdb_path)

        chosen_layer = layer or (layers[0] if layers else None)
        if not chosen_layer:
            raise ValueError("Tidak ada layer GDB yang bisa dibaca.")

        print(f"[DEBUG] Membaca layer: {chosen_layer}")

        try:
            gdf = gpd.read_file(gdb_path, layer=chosen_layer)
        except Exception as e:
            raise ValueError(f"Gagal membaca layer dari GDB: {e}")

        if gdf.crs is None:
            # Assume WGS84 when the dataset declares no CRS.
            gdf.set_crs("EPSG:4326", inplace=True)

        return gdf
    finally:
        # BUGFIX: the temp dir used to leak whenever extractall or
        # fiona.listlayers raised; clean up unconditionally now.
        shutil.rmtree(tmpdir, ignore_errors=True)
|
||||
250
services/reader_pdf.py
Normal file
250
services/reader_pdf.py
Normal file
|
|
@ -0,0 +1,250 @@
|
|||
import pdfplumber
|
||||
import re
|
||||
import pandas as pd
|
||||
|
||||
def is_number(s):
    """Return True when *s* looks like an integer, ignoring '.'/',' separators."""
    if s is None:
        return False
    cleaned = str(s).strip()
    for separator in (',', '.'):
        cleaned = cleaned.replace(separator, '')
    return cleaned.isdigit()
|
||||
|
||||
def row_ratio(row):
    """Fraction of a row's non-empty cells that are numeric; 0 for empty rows."""
    cells = [cell for cell in row if cell not in (None, '', ' ')]
    if not cells:
        return 0
    numeric_count = sum(is_number(cell) for cell in cells)
    return numeric_count / len(cells)
|
||||
|
||||
def has_mixed_text_and_numbers(row):
    """True when a row contains both an alphabetic cell and a numeric cell."""
    cells = [cell for cell in row if cell not in (None, '', ' ')]
    contains_text = any(
        isinstance(cell, str) and re.search(r'[A-Za-z]', str(cell))
        for cell in cells
    )
    contains_number = any(is_number(cell) for cell in cells)
    return contains_text and contains_number
|
||||
|
||||
def is_short_text_row(row):
    """Detect a short text-only row (at most 2 cells, under 20 chars joined)."""
    cells = [str(cell).strip() for cell in row if cell not in (None, '', ' ')]
    if not cells:
        return False
    # Any numeric cell disqualifies the row from being "text only".
    if any(is_number(cell) for cell in cells):
        return False
    return len(cells) <= 2 and len(" ".join(cells)) < 20
|
||||
|
||||
def detect_header_rows(rows):
    """
    Split raw table rows into ``(header_rows, body_rows)``.

    The body is assumed to start at the first row that mixes text with
    numbers, is mostly numeric (>30%), contains a bare integer cell, or is
    the first row with any numbers after a fully non-numeric row.

    Returns
    -------
    tuple[list, list]
        Header rows (short text rows filtered out unless followed by a
        non-numeric row) and body rows. Both empty for empty input.
    """
    if not rows:
        # BUGFIX: callers unpack two values (`head, body = ...`); the
        # original returned a bare [] here, which raised on empty tables.
        return [], []

    ratios = [row_ratio(r) for r in rows]
    body_start_index = None

    for i in range(1, len(rows)):
        row = rows[i]
        if has_mixed_text_and_numbers(row):
            body_start_index = i
            break
        if ratios[i] > 0.3:
            body_start_index = i
            break
        if any(isinstance(c, str) and re.match(r'^\d+$', c.strip()) for c in row):
            body_start_index = i
            break
        if ratios[i - 1] == 0 and ratios[i] > 0:
            body_start_index = i
            break

    if body_start_index is None:
        # No body detected: treat every row as header material.
        body_start_index = len(rows)

    potential_headers = rows[:body_start_index]
    body_filtered = rows[body_start_index:]
    header_filtered = []
    for idx, row in enumerate(potential_headers):
        if is_short_text_row(row):
            # Keep a short text row only when the following candidate row is
            # fully non-numeric (i.e. it still looks like header material).
            if idx + 1 < len(potential_headers) and ratios[idx + 1] == 0:
                header_filtered.append(row)
            else:
                continue
        else:
            header_filtered.append(row)

    return header_filtered, body_filtered
|
||||
|
||||
|
||||
def merge_multiline_header(header_rows):
    """
    Collapse stacked header rows into a single flat list of column names.

    For every column the bottom-most non-blank cell wins; embedded newlines
    become spaces and fully blank columns are dropped.
    """
    merged = []
    for column_cells in zip(*header_rows):
        chosen = ''
        for cell in reversed(column_cells):
            if cell and str(cell).strip():
                chosen = cell
                break
        merged.append(str(chosen).replace('\n', ' ').strip())

    return [name for name in merged if name not in ['', None]]
|
||||
|
||||
|
||||
|
||||
# Spellings that denote a running-number ("No"/"Nomor") column: case and
# punctuation variants plus common OCR confusions (e.g. "N0" with a zero).
# Consumed by has_number_header() below.
NUMBER_HEADER_KEYWORDS = ["no","no.","no .","no . ","no :","no : ","nomor","nomor.","nomor :","nomor urut","no urut","no. urut","no-urut","no_urut","nomor_urut","nomor-urut","No","NO","NO.","No.","No :","NO :","Nomor","NOMOR","Nomor Urut","NOMOR URUT","No Urut","NO URUT","No. Urut","NO. URUT","No /","No / ","No / Nama","No -","No - ","Nomor /","Nomor -","Number","No. of","No of","Index","Serial","Order","ID","ID No","ID No.","Sr No","Sr. No","S/N","SN","Sl No","Sl. No","N0","N0.","N0 :","NOM0R","NOM0R URUT","N0MOR",]
|
||||
|
||||
def has_number_header(header):
    """
    Check whether *header* contains a serial-number ("No"/"Nomor") column.

    Membership follows Python ``in`` semantics: substring match when
    *header* is a string, exact-element match when it is a list.
    """
    return any(keyword in header for keyword in NUMBER_HEADER_KEYWORDS)
|
||||
|
||||
def is_numbering_column(col_values):
    """True when >60% of non-empty string cells look like '1', '01', '002'...."""
    pattern = re.compile(r"0*\d{1,3}")
    candidates = [v for v in col_values if v and isinstance(v, str)]
    if not candidates:
        return False
    hits = sum(1 for v in candidates if pattern.fullmatch(v.strip()))
    return hits / len(candidates) > 0.6
|
||||
|
||||
def is_numeric_value(v):
    """Return True for ints/floats, or strings of at most 3 digits ('7', '007')."""
    if isinstance(v, (int, float)):
        return True
    if isinstance(v, str):
        return re.fullmatch(r"0*\d{1,3}", v.strip()) is not None
    return False
|
||||
|
||||
def cleaning_column(headers, bodies):
    """Strip auto-numbering columns and ragged rows from extracted tables.

    NOTE(review): as called from read_pdf this receives a single list of
    column names plus a one-element list of bodies, so ``zip(headers, bodies)``
    pairs the FIRST column name with the body — `header` below is then a
    string and has_number_header does a substring check. Confirm this pairing
    is intentional before passing multiple bodies.
    """
    cleaned_bodies = []

    for header, body in zip(headers, bodies):
        if not body:
            cleaned_bodies.append(body)
            continue

        # If the header does not announce a "No" column but the first column
        # is filled with running numbers, drop that numbering column per row.
        header_has_number = has_number_header(header)
        first_col = [row[0] for row in body if row and len(row) > 0]
        first_col_is_numbering = is_numbering_column(first_col)

        if not header_has_number and first_col_is_numbering:
            new_body = []
            for row in body:
                if not row:
                    continue
                first_val = row[0]
                if is_numeric_value(first_val) and len(row) > 1:
                    new_body.append(row[1:])
                else:
                    new_body.append(row)
            body = new_body

        # Keep only rows whose width matches the number of header columns
        # (len(headers) is the column count under the calling convention above).
        header_len = len(headers)
        filtered_body = [row for row in body if len(row) == header_len]

        cleaned_bodies.append(filtered_body)

    return cleaned_bodies
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def read_pdf(path: str):
    """
    Extract tables from a PDF into a list of parsed-table dicts.

    Each dict has keys ``title`` (1-based table index as a string),
    ``columns`` (flattened header names) and ``rows`` (cleaned body rows).
    Tables with 4 rows or fewer are skipped as misdetections.
    """
    tables_data = []
    with pdfplumber.open(path) as pdf:
        # BUGFIX: previously only pdf.pages[0] was scanned, silently dropping
        # every table on later pages; scan the whole document now.
        for page in pdf.pages:
            for t in page.find_tables():
                table = t.extract()
                if len(table) > 4:
                    tables_data.append(table)

    print(f"\nTotal tabel valid: {len(tables_data)}\n")

    # Split each raw table into header rows and body rows.
    header_only = []
    body_only = []
    for tbl in tables_data:
        head, body = detect_header_rows(tbl)
        header_only.append(head)
        body_only.append(body)

    # Flatten multi-line headers into single column-name lists.
    clean_header = [merge_multiline_header(h) for h in header_only]

    clean_body = []
    for i, raw_body in enumerate(body_only):
        # Drop empty cells, then remove numbering columns / ragged rows.
        con_body = [[cell for cell in row if cell not in (None, '')] for row in raw_body]
        cleaned = cleaning_column(clean_header[i], [con_body])
        clean_body.append(cleaned[0])

    parsed = []
    for i, (cols, rows) in enumerate(zip(clean_header, clean_body), start=1):
        parsed.append({
            "title": str(i),  # placeholder title; callers may overwrite it
            "columns": cols,
            "rows": rows,
        })

    return parsed
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def convert_df(payload):
    """
    Convert a parsed-table payload into a pandas DataFrame.

    Parameters
    ----------
    payload : dict
        Must contain ``columns`` (list of names) and ``rows`` (list of
        equal-length lists); an optional ``title`` is stored in ``df.attrs``.

    Raises
    ------
    ValueError
        When required keys are missing or a row's length mismatches columns.
    TypeError
        When ``columns`` or ``rows`` is not a list.
    """
    # BUGFIX: removed the debug print that dumped the entire payload on
    # every call.
    if "columns" not in payload or "rows" not in payload:
        raise ValueError("Payload tidak memiliki key 'columns' atau 'rows'.")

    if not isinstance(payload["columns"], list):
        raise TypeError("'columns' harus berupa list.")
    if not isinstance(payload["rows"], list):
        raise TypeError("'rows' harus berupa list.")

    # Reject ragged rows before handing them to pandas.
    for i, row in enumerate(payload["rows"]):
        if len(row) != len(payload["columns"]):
            raise ValueError(f"Jumlah elemen di baris ke-{i} tidak sesuai jumlah kolom.")

    df = pd.DataFrame(payload["rows"], columns=payload["columns"])

    if "title" in payload:
        df.attrs["title"] = payload["title"]

    return df
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def test_read_pdf():
    """
    Return a canned two-table fixture in read_pdf's output format.

    Used for exercising convert_df and downstream consumers without a real
    PDF. (A second, single-table commented-out fixture was dead code and has
    been removed.)

    NOTE(review): the ``\/`` sequences below are JSON-style escapes pasted
    verbatim; Python keeps them as literal backslash+slash.
    """
    parsed = [{"title":"Luas Catchment Area (km2) Pada Wilayah Sungai di Provinsi Jawa Timur","columns":["Wilayah Sungai","Luas (km2)","Jumlah DAS"],"rows":[["Bengawan Solo","13.070,00","94 DAS"],["Brantas","13.880,00","20 DAS"],["Welang -Rejoso","2.601,00","36 DAS"],["Pekalen -Sampean","3.953,00","56 DAS"],["Baru -Bajulmati","3.675,00","60 DAS"],["Bondoyudo -Bedadung","5.364,00","47 DAS"],["Madura","4.575,00","173 DAS"]]},{"title":"Jumlah dan Kepadatan Penduduk Menurut Kabupaten\/kota di Provinsi Jawa Timur Tahun 2021","columns":["Kabupaten\/Kota","Jumlah Penduduk","Persentase","Kepadatan Penduduk (Jiwa per Km2)"],"rows":[["Bangkalan","1.082.759","2,64","1.081,20"],["Banyuwangi","1.749.773","4,27","302,60"],["Blitar","1.228.292","3,00","919,05"],["Bojonegoro","1.343.895","3,28","611,20"],["Bondowoso","801.541","1,96","525,27"],["Gresik","1.283.961","3,13","1.077,83"],["Jember","2.581.486","6,30","834,80"],["Jombang","1.350.483","3,29","1.211,10"],["Kediri","1.671.821","4,08","1.206,18"],["Lamongan","1.379.731","3,37","774,24"],["Lumajang","1.091.856","2,66","609,67"],["Madiun","754.263","1,84","726,94"],["Magetan","689.369","1,68","1.000,77"],["Malang","2.611.907","6,37","739,78"],["Mojokerto","1.126.540","2,75","1.569,37"],["Nganjuk","1.133.556","2,77","925,92"],["Ngawi","896.768","2,19","691,96"],["Pacitan","597.580","1,46","429,94"],["Pamekasan","840.790","2,05","1.061,28"],["Pasuruan","1.603.754","3,91","1.088,01"],["Ponorogo","968.681","2,36","741,89"],["Probolinggo","1.156.570","2,82","681,86"],["Sampang","902.514","2,20","731,92"],["Sidoarjo","1.951.723","4,76","3.076,58"],["Situbondo","666.245","1,63","398,98"],["Sumenep","1.134.750","2,77","567,79"],["Trenggalek","746.734","1,82","650,91"],["Tuban","1.223.257","2,98","666,93"],["Tulungagung","1.126.679","2,75","1.067,28"],["Kota Batu","215.248","0,53","1.574,14"],["Kota Blitar","158.123","0,39","4.854,87"],["Kota Kediri","292.363","0,71","4.611,40"],["Kota Madiun","201.243","0,49","6.045,15"],["Kota Malang","866.356","2,11","5.963,35"],["Kota Mojokerto","139.961","0,34","8.497,94"],["Kota Pasuruan","210.341","0,51","5.960,36"],["Kota Probolinggo","242.246","0,59","4.274,68"],["Kota Surabaya","2.970.843","7,25","8.475,05"],["Provinsi Jawa Timur","40.994.002","100,00","76.228,17"]]}]
    return parsed
|
||||
60
services/reader_shp.py
Normal file
60
services/reader_shp.py
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
import geopandas as gpd
|
||||
import fiona
|
||||
import zipfile
|
||||
import tempfile
|
||||
import os
|
||||
import shutil
|
||||
from shapely.geometry import shape
|
||||
|
||||
def read_shp(path: str):
    """
    Read a shapefile (bare .shp or a ZIP containing one) into a GeoDataFrame.

    When the loaded geometry column is missing or entirely empty, the
    features are rebuilt from the raw fiona records. CRS defaults to
    EPSG:4326 when the file declares none.

    Raises
    ------
    ValueError
        On empty path, a ZIP without a .shp, or a read failure.
    """
    if not path:
        raise ValueError("Path shapefile tidak boleh kosong.")

    tmpdir = None
    try:
        if path.lower().endswith(".zip"):
            tmpdir = tempfile.mkdtemp()
            with zipfile.ZipFile(path, "r") as zip_ref:
                zip_ref.extractall(tmpdir)

            shp_files = []
            for root, _, files in os.walk(tmpdir):
                for f in files:
                    if f.lower().endswith(".shp"):
                        shp_files.append(os.path.join(root, f))

            if not shp_files:
                raise ValueError("Tidak ditemukan file .shp di dalam ZIP.")
            # Only the first shapefile found is read.
            shp_path = shp_files[0]
            print(f"[DEBUG] Membaca shapefile: {os.path.basename(shp_path)}")
        else:
            shp_path = path

        try:
            gdf = gpd.read_file(shp_path)
        except Exception as e:
            raise ValueError(f"Gagal membaca shapefile: {e}")

        if "geometry" not in gdf.columns or gdf.geometry.is_empty.all():
            print("[WARN] Geometry kosong. Mencoba membangun ulang dari fitur mentah...")

            with fiona.open(shp_path) as src:
                features = []
                for feat in src:
                    geom = shape(feat["geometry"]) if feat["geometry"] else None
                    props = feat["properties"]
                    props["geometry"] = geom
                    features.append(props)

                gdf = gpd.GeoDataFrame(features, geometry="geometry", crs=src.crs)

        if gdf.crs is None:
            # Assume WGS84 when no CRS is declared.
            gdf.set_crs("EPSG:4326", inplace=True)

        return gdf
    finally:
        # BUGFIX: the temp dir used to leak whenever an error path raised
        # before the trailing cleanup; remove it unconditionally now.
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
|
||||
Loading…
Reference in New Issue
Block a user