commit 16f00425082363f08bae1662dd3080c83aa2968e Author: dmsanhrProject Date: Wed Oct 29 17:07:48 2025 +0700 Init Commit diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..23ffbc0 Binary files /dev/null and b/.DS_Store differ diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..17c1fa7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,12 @@ +.env +main_old.py +sijalinmaja.json + + +venv/ +pdf/ +data_cache/ +cache/ +testing/ +test-ai/ +uploads/ diff --git a/core/config.py b/core/config.py new file mode 100644 index 0000000..65999ba --- /dev/null +++ b/core/config.py @@ -0,0 +1,19 @@ +from pathlib import Path +from dotenv import load_dotenv +import os + +load_dotenv() + +POSTGIS_URL = os.getenv("POSTGIS_URL") +UPLOAD_FOLDER = Path(os.getenv("UPLOAD_FOLDER", "./uploads")) +MAX_FILE_MB = int(os.getenv("MAX_FILE_MB", 200)) + +REFERENCE_DB_URL = os.getenv("REFERENCE_DB_URL") +REFERENCE_SCHEMA = os.getenv("REFERENCE_SCHEMA", "batas_wilayah") +REF_COLUMN_MAP = { + 'desa': 'NAMOBJ', + 'kecamatan': 'NAMA_KECAMATAN', + 'kabupaten': 'NAMOBJ' +} + +CACHE_FOLDER = Path(os.getenv("CACHE_FOLDER", "./cache")) \ No newline at end of file diff --git a/database/connection.py b/database/connection.py new file mode 100644 index 0000000..6903193 --- /dev/null +++ b/database/connection.py @@ -0,0 +1,6 @@ +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker +from core.config import POSTGIS_URL + +engine = create_engine(POSTGIS_URL, pool_pre_ping=True) +SessionLocal = sessionmaker(bind=engine) diff --git a/database/models.py b/database/models.py new file mode 100644 index 0000000..f848160 --- /dev/null +++ b/database/models.py @@ -0,0 +1,16 @@ +from sqlalchemy import Column, Integer, String, Text, TIMESTAMP +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.sql import func + +Base = declarative_base() + +class UploadLog(Base): + __tablename__ = "upload_logs" + id = Column(Integer, primary_key=True, index=True) + 
filename = Column(String, nullable=False) + table_name = Column(String, nullable=False) + file_type = Column(String, nullable=False) + rows_count = Column(Integer) + uploaded_at = Column(TIMESTAMP, server_default=func.now()) + status = Column(String) + message = Column(Text) diff --git a/database/uploader.py b/database/uploader.py new file mode 100644 index 0000000..da164ac --- /dev/null +++ b/database/uploader.py @@ -0,0 +1,16 @@ +import geopandas as gpd +import pandas as pd +from database.connection import engine +from sqlalchemy import text + +def save_dataframe_dynamic(df: pd.DataFrame, table_name: str): + """Save pandas DataFrame to Postgres (non-geo).""" + df.to_sql(table_name, engine, if_exists="replace", index=False, method='multi', chunksize=1000) + +def save_geodataframe(gdf: gpd.GeoDataFrame, table_name: str): + """Save GeoDataFrame to PostGIS (requires geoalchemy/geopandas).""" + # ensure geometry column exists and CRS set + if gdf.crs is None: + gdf = gdf.set_crs("EPSG:4326", allow_override=True) + # geopandas >= 0.10 has to_postgis in some installs; fallback using SQLAlchemy + GeoAlchemy2: + gdf.to_postgis(table_name, engine, if_exists="replace") diff --git a/init_db.py b/init_db.py new file mode 100644 index 0000000..0702c1c --- /dev/null +++ b/init_db.py @@ -0,0 +1,3 @@ +from database.connection import engine +from database.models import Base +Base.metadata.create_all(bind=engine) diff --git a/main.py b/main.py new file mode 100644 index 0000000..c034f6b --- /dev/null +++ b/main.py @@ -0,0 +1,437 @@ +import os +import pandas as pd +import geopandas as gpd +import numpy as np +import zipfile +from shapely.geometry.base import BaseGeometry +from shapely.geometry import base as shapely_base +from fastapi import FastAPI, File, UploadFile, HTTPException +from fastapi.responses import JSONResponse +from core.config import UPLOAD_FOLDER, MAX_FILE_MB +from services.reader_csv import read_csv +from services.reader_shp import read_shp +from 
services.reader_gdb import read_gdb +from services.reader_pdf import convert_df, read_pdf +from services.geometry_detector import detect_and_build_geometry +from services.geometry_detector import attach_polygon_geometry_auto +from database.connection import engine +from database.models import Base +import time +import pathlib +from fastapi.middleware.cors import CORSMiddleware + +from pydantic import BaseModel +from typing import List +from shapely import wkt +from sqlalchemy import text + + +UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True) + +app = FastAPI(title="ETL Geo Upload Service") + + + +origins = [ + "http://localhost:3000", + "http://127.0.0.1:3000", + "http://localhost:5173", + "http://127.0.0.1:5173", +] + +app.add_middleware( + CORSMiddleware, + allow_origins=origins, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + + +# Create upload_logs table if not exists +Base.metadata.create_all(bind=engine) + +def generate_table_name(filename: str, prefix: str = "data"): + name = pathlib.Path(filename).stem + ts = time.strftime("%Y%m%d%H%M%S") + safe = "".join([c if c.isalnum() or c=='_' else '_' for c in name]) + return f"{prefix}_{safe}_{ts}" + + +def is_geom_empty(g): + if g is None: + return True + if isinstance(g, float) and pd.isna(g): + return True + if isinstance(g, BaseGeometry): + return g.is_empty + return False + + +def safe_json(value): + """Konversi aman untuk semua tipe numpy/pandas/shapely ke tipe JSON-serializable""" + if isinstance(value, (np.int64, np.int32)): + return int(value) + if isinstance(value, (np.float64, np.float32)): + return float(value) + if isinstance(value, pd.Timestamp): + return value.isoformat() + if isinstance(value, shapely_base.BaseGeometry): + return str(value) # ubah ke WKT string + if pd.isna(value): + return None + return value + + +def detect_zip_type(zip_path: str) -> str: + with zipfile.ZipFile(zip_path, "r") as zip_ref: + files = zip_ref.namelist() + + if 
any(f.lower().endswith(".gdb/") or ".gdb/" in f.lower() for f in files): + return "gdb" + + if any(f.lower().endswith(ext) for ext in [".gdbtable", ".gdbtablx", ".gdbindexes", ".spx"] for f in files): + return "gdb" + + if any(f.lower().endswith(".shp") for f in files): + return "shp" + + return "unknown" + + + + + + +@app.post("/upload") +async def upload_file(file: UploadFile = File(...)): + fname = file.filename + ext = os.path.splitext(fname)[1].lower() + contents = await file.read() + size_mb = len(contents) / (1024*1024) + if size_mb > MAX_FILE_MB: + raise HTTPException(status_code=413, detail="File too large") + tmp_path = UPLOAD_FOLDER / fname + with open(tmp_path, "wb") as f: + f.write(contents) + + try: + df = None + + print('ext', ext) + + if ext == ".csv": + df = read_csv(str(tmp_path)) + elif ext == ".pdf": + tbl = read_pdf(tmp_path) + if len(tbl) > 1: + response = { + "message": "File berhasil dibaca dan dianalisis.", + "tables": tbl, + "file_type": ext + } + return JSONResponse(content=response) + else: + df = convert_df(tbl[0]) + elif ext == ".zip": + zip_type = detect_zip_type(str(tmp_path)) + + if zip_type == "shp": + print("[INFO] ZIP terdeteksi sebagai Shapefile.") + df = read_shp(str(tmp_path)) + + elif zip_type == "gdb": + print("[INFO] ZIP terdeteksi sebagai Geodatabase (GDB).") + df = read_gdb(str(tmp_path)) + + else: + raise HTTPException( + status_code=400, + detail="ZIP file tidak mengandung SHP atau GDB yang valid." 
+ ) + else: + raise HTTPException(status_code=400, detail="Unsupported file type") + + if df is None or (hasattr(df, "empty") and df.empty): + return JSONResponse({"error": "No valid table detected"}, status_code=400) + + result = detect_and_build_geometry(df, master_polygons=None) + + if not hasattr(result, "geometry") or result.geometry.isna().all(): + result = attach_polygon_geometry_auto(result) + + if isinstance(result, gpd.GeoDataFrame) and "geometry" in result.columns: + geom_type = ", ".join([g for g in result.geometry.geom_type.unique() if g]) \ + if not result.empty else "None" + + null_geom = result.geometry.isna().sum() + print(f"[INFO] Tipe Geometry: {geom_type}") + print(f"[INFO] Jumlah geometry kosong: {null_geom}") + else: + response = { + "message": "Tidak menemukan tabel yang relevan.", + "file_type": ext, + "rows": 0, + "columns": 0, + "geometry_valid": 0, + "geometry_empty": 0, + "geometry_valid_percent": 0, + "warnings": [], + "warning_examples": [], + "preview": [] + } + + return JSONResponse(content=response) + + tmp_path.unlink(missing_ok=True) + + result = result.replace([pd.NA, float('inf'), float('-inf')], None) + + if isinstance(result, gpd.GeoDataFrame) and 'geometry' in result.columns: + result['geometry'] = result['geometry'].apply( + lambda g: g.wkt if g is not None else None + ) + + empty_count = result['geometry'].apply(is_geom_empty).sum() + valid_count = len(result) - empty_count + match_percentage = (valid_count / len(result)) * 100 + + warnings = [] + if empty_count > 0: + warnings.append( + f"{empty_count} dari {len(result)} baris tidak memiliki geometry yang valid " + f"({100 - match_percentage:.2f}% data gagal cocok)." 
+ ) + + if empty_count > 0: + examples = result[result['geometry'].apply(is_geom_empty)].head(500) + warning_examples = examples.to_dict(orient="records") + else: + warning_examples = [] + + preview_data = result.to_dict(orient="records") + + preview_safe = [ + {k: safe_json(v) for k, v in row.items()} for row in preview_data + ] + + warning_safe = [ + {k: safe_json(v) for k, v in row.items()} for row in warning_examples + ] + + response = { + "message": "File berhasil dibaca dan dianalisis.", + "rows": int(len(result)), + "columns": list(map(str, result.columns)), + "geometry_valid": int(valid_count), + "geometry_empty": int(empty_count), + "geometry_valid_percent": float(round(match_percentage, 2)), + "warnings": warnings, + "warning_examples": warning_safe, + "preview": preview_safe + } + + return JSONResponse(content=response) + + except Exception as e: + print(f"[ERROR] {e}") + return JSONResponse({"error": str(e)}, status_code=500) + + # finally: + # db_session.close() + + + + + + + + +class PdfRequest(BaseModel): + title: str + columns: List[str] + rows: List[List] + +@app.post("/process-pdf") +async def upload_file(payload: PdfRequest): + try: + df = convert_df(payload.model_dump()) + if df is None or (hasattr(df, "empty") and df.empty): + return JSONResponse({"error": "No valid table detected"}, status_code=400) + + result = detect_and_build_geometry(df, master_polygons=None) + + if not hasattr(result, "geometry") or result.geometry.isna().all(): + print("[INFO] Mencoba menambahkan geometry (MultiPolygon) berdasarkan nama wilayah...") + result = attach_polygon_geometry_auto(result) + + print("\n" + "="*80) + + if isinstance(result, gpd.GeoDataFrame) and "geometry" in result.columns: + geom_type = ", ".join([g for g in result.geometry.geom_type.unique() if g]) \ + if not result.empty else "None" + + null_geom = result.geometry.isna().sum() + print(f"[INFO] Tipe Geometry: {geom_type}") + print(f"[INFO] Jumlah geometry kosong: {null_geom}") + else: + 
print("[WARN] Object bukan GeoDataFrame atau tidak punya kolom geometry.") + print(f"[DEBUG] Kolom saat ini: {list(result.columns)}") + response = { + "message": "Tidak menemukan tabel yang relevan.", + "file_type": ".pdf", + "rows": 0, + "columns": 0, + "geometry_valid": 0, + "geometry_empty": 0, + "geometry_valid_percent": 0, + "warnings": [], + "warning_examples": [], + "preview": [] + } + + return JSONResponse(content=response) + + result = result.replace([pd.NA, float('inf'), float('-inf')], None) + if isinstance(result, gpd.GeoDataFrame) and 'geometry' in result.columns: + result['geometry'] = result['geometry'].apply( + lambda g: g.wkt if g is not None else None + ) + + empty_count = result['geometry'].apply(is_geom_empty).sum() + valid_count = len(result) - empty_count + match_percentage = (valid_count / len(result)) * 100 + + warnings = [] + if empty_count > 0: + warnings.append( + f"{empty_count} dari {len(result)} baris tidak memiliki geometry yang valid " + f"({100 - match_percentage:.2f}% data gagal cocok)." 
+ ) + + if empty_count > 0: + examples = result[result['geometry'].apply(is_geom_empty)].head(500) + warning_examples = examples.to_dict(orient="records") + else: + warning_examples = [] + + # preview_data = result.head(5).to_dict(orient="records") + preview_data = result.to_dict(orient="records") + + preview_safe = [ + {k: safe_json(v) for k, v in row.items()} for row in preview_data + ] + + warning_safe = [ + {k: safe_json(v) for k, v in row.items()} for row in warning_examples + ] + + response = { + "message": "File berhasil dibaca dan dianalisis.", + "rows": int(len(result)), + "columns": list(map(str, result.columns)), + "geometry_valid": int(valid_count), + "geometry_empty": int(empty_count), + "geometry_valid_percent": float(round(match_percentage, 2)), + "warnings": warnings, + "warning_examples": warning_safe, + "preview": preview_safe + } + + return JSONResponse(content=response) + + except Exception as e: + print(f"[ERROR] {e}") + + return JSONResponse({"error": str(e)}, status_code=500) + + # finally: + # db_session.close() + + + + + + + + + + +VALID_WKT_PREFIXES = ( + "POINT", + "POINT Z", + "POINT M", + "POINT ZM", + "MULTIPOINT", + "MULTIPOINT Z", + "MULTIPOINT M", + "MULTIPOINT ZM", + "LINESTRING", + "LINESTRING Z", + "LINESTRING M", + "LINESTRING ZM", + "MULTILINESTRING", + "MULTILINESTRING Z", + "MULTILINESTRING M", + "MULTILINESTRING ZM", + "POLYGON", + "POLYGON Z", + "POLYGON M", + "POLYGON ZM", + "MULTIPOLYGON", + "MULTIPOLYGON Z", + "MULTIPOLYGON M", + "MULTIPOLYGON ZM", + "GEOMETRYCOLLECTION", + "GEOMETRYCOLLECTION Z", + "GEOMETRYCOLLECTION M", + "GEOMETRYCOLLECTION ZM", +) + + +class UploadRequest(BaseModel): + title: str + rows: List[dict] + columns: List[str] + +@app.post("/upload_to_postgis") +def upload_to_postgis(payload: UploadRequest): + try: + table_name = payload.title.lower().replace(" ", "_") + + df = pd.DataFrame(payload.rows) + print(f"[INFO] Diterima {len(df)} baris data dari frontend.") + + if "geometry" in df.columns: + 
df["geometry"] = df["geometry"].apply( + lambda g: wkt.loads(g) if isinstance(g, str) and g.strip().upper().startswith(VALID_WKT_PREFIXES) else None + ) + gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326") + else: + raise HTTPException(status_code=400, detail="Kolom geometry tidak ditemukan dalam data.") + + with engine.begin() as conn: + conn.execute(text(f"DROP TABLE IF EXISTS {table_name}")) + + gdf.to_postgis(table_name, engine, if_exists="replace", index=False) + + with engine.begin() as conn: + conn.execute(text(f'ALTER TABLE "{table_name}" ADD COLUMN _id SERIAL PRIMARY KEY;')) + + print(f"[INFO] Tabel '{table_name}' berhasil dibuat di PostGIS ({len(gdf)} baris).") + + return { + "table_name": table_name, + "status": "success", + "message": f"Tabel '{table_name}' berhasil diunggah ke PostGIS.", + "total_rows": len(gdf), + "geometry_type": list(gdf.geom_type.unique()) + } + + except Exception as e: + print(f"[ERROR] Gagal upload ke PostGIS: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..9b2c3df --- /dev/null +++ b/requirements.txt @@ -0,0 +1,25 @@ +fastapi +uvicorn[standard] +pandas +numpy +geopandas +shapely +fiona +pyproj +SQLAlchemy +sqlalchemy +psycopg2-binary +rapidfuzz +pdfplumber +zipfile36 +python-dotenv +pydantic +python-multipart +aiofiles +starlette +openpyxl +requests +pathlib + +# --- jika menggunakan ai --- +groq diff --git a/services/geometry_detector.py b/services/geometry_detector.py new file mode 100644 index 0000000..4b1e7a6 --- /dev/null +++ b/services/geometry_detector.py @@ -0,0 +1,376 @@ +import geopandas as gpd +from shapely.geometry import Point, LineString +import pandas as pd +import re +from shapely import wkt +from rapidfuzz import process, fuzz +from sqlalchemy import create_engine +from shapely.geometry.base import BaseGeometry +from core.config import REFERENCE_DB_URL, REFERENCE_SCHEMA, REF_COLUMN_MAP + +# 
============================================================ +# KONFIGURASI DAN KONSTANTA +# ============================================================ + +COLUMN_ALIASES = { + 'desa': ['desa', 'kelurahan', 'desa_kelurahan', 'desa/kelurahan', 'nama_desa', 'nama_kelurahan', 'Desa/Kel'], + 'kecamatan': ['kec', 'kecamatan', 'nama_kec', 'nama_kecamatan'], + 'kabupaten': ['kab', 'kabupaten', 'kota', 'kabupaten_kota', 'kota_kabupaten', 'kab/kota', 'kota/kabupaten', 'kota/kab'] +} + +# ============================================================ +# FUNGSI BANTU ADMINISTRATIF +# ============================================================ + +def find_admin_column(df, aliases): + """Mencari kolom yang paling cocok untuk tiap level admin (desa/kec/kab)""" + matched = {} + for level, alias_list in aliases.items(): + for col in df.columns: + col_norm = col.strip().lower().replace(' ', '_').replace('/', '_') + if any(alias in col_norm for alias in alias_list): + matched[level] = col + break + return matched + + +def detect_smallest_admin_level(df): + """Mendeteksi level administratif terkecil yang ada di DataFrame""" + cols = [c.lower() for c in df.columns] + if any('desa' in c or 'kelurahan' in c for c in cols): + return 'desa' + elif any('kecamatan' in c for c in cols): + return 'kecamatan' + elif any('kab' in c or 'kota' in c for c in cols): + return 'kabupaten' + return None + + +def fuzzy_merge(df, master, left_key, right_key, threshold=85): + """Melakukan fuzzy matching antar nama wilayah""" + matches = df[left_key].apply( + lambda x: process.extractOne(str(x), master[right_key], score_cutoff=threshold) + ) + df['match'] = matches.apply(lambda m: m[0] if m else None) + merged = df.merge(master, left_on='match', right_on=right_key, how='left') + return merged + + + + + +def normalize_name(name: str, level: str = None): + if not isinstance(name, str): + return None + + name = name.strip() + if not name: + return None + + raw = name.lower() + raw = 
re.sub(r'^(desa|kelurahan|kel|dusun|kampung)\s+', '', raw) + raw = re.sub(r'^(kecamatan|kec)\s+', '', raw) + raw = re.sub(r'^(kabupaten|kab\.?|kab)\s+', '', raw) + + if level in ["kabupaten", "kota"]: + raw = re.sub(r'^(kota\s+)', '', raw) + + raw = re.sub(r'[^a-z\s]', '', raw) + raw = re.sub(r'\s+', ' ', raw).strip() + + tokens = raw.split() + + merged_tokens = [] + i = 0 + while i < len(tokens): + if i < len(tokens) - 1: + sim = fuzz.ratio(tokens[i], tokens[i + 1]) + if sim > 75: + merged_tokens.append(tokens[i] + tokens[i + 1]) + i += 2 + continue + merged_tokens.append(tokens[i]) + i += 1 + + cleaned_tokens = [] + prev = None + for tok in merged_tokens: + if prev and fuzz.ratio(prev, tok) > 95: + continue + cleaned_tokens.append(tok) + prev = tok + + raw = " ".join(cleaned_tokens) + formatted = raw.title() + + if level in ["kabupaten", "kota"]: + if "kota" in name.lower(): + if not formatted.startswith("Kota "): + formatted = f"Kota {formatted}" + else: + formatted = formatted.replace("Kota ", "") + + return formatted + + + + + +def is_geom_empty(g): + """True jika geometry None, NaN, atau geometry Shapely kosong.""" + if g is None: + return True + if isinstance(g, float) and pd.isna(g): + return True + if isinstance(g, BaseGeometry): + return g.is_empty + return False + + + + + + +# ============================================================ +# FUNGSI UTAMA GEOMETRY DETECTION (LAT/LON / PATH) +# ============================================================ +def detect_and_build_geometry(df: pd.DataFrame, master_polygons: gpd.GeoDataFrame = None): + """ + Mendeteksi dan membentuk geometry dari DataFrame. + Bisa dari lat/lon, WKT, atau join ke master polygon (jika disediakan). 
+ """ + + if isinstance(df, gpd.GeoDataFrame): + if "geometry" in df.columns and df.geometry.notna().any(): + geom_count = df.geometry.notna().sum() + geom_type = list(df.geom_type.unique()) + print(f"[INFO] Detected existing geometry in GeoDataFrame ({geom_count} features, {geom_type}).") + return df + + lat_col = next( + (c for c in df.columns if re.search(r'\b(lat|latitude|y[_\s]*coord|y$)\b', c.lower())), None + ) + lon_col = next( + (c for c in df.columns if re.search(r'\b(lon|long|longitude|x[_\s]*coord|x$)\b', c.lower())), None + ) + + if lat_col and lon_col: + df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce') + df[lon_col] = pd.to_numeric(df[lon_col], errors='coerce') + gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df[lon_col], df[lat_col]), crs="EPSG:4326") + print("[INFO] Geometry dibangun dari kolom lat/lon.") + return gdf + + coord_col = next( + (c for c in df.columns if re.search(r'(geom|geometry|wkt|shp|shape|path|coord)', c.lower())), None + ) + + if coord_col and df[coord_col].notnull().any(): + sample_val = str(df[coord_col].dropna().iloc[0]).strip() + + if sample_val.startswith('['): + def parse_geom(val): + try: + pts = eval(val) + return LineString(pts) + except Exception: + return None + df['geometry'] = df[coord_col].apply(parse_geom) + gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326") + print("[INFO] Geometry dibangun dari kolom koordinat/path (list of points).") + return gdf + + elif any(x in sample_val.upper() for x in ["POINT", "LINESTRING", "POLYGON"]): + try: + df['geometry'] = df[coord_col].apply( + lambda g: wkt.loads(g) if isinstance(g, str) and any( + x in g.upper() for x in ["POINT", "LINESTRING", "POLYGON"] + ) else None + ) + gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326") + print("[INFO] Geometry dibangun dari kolom WKT (Point/Line/Polygon/MultiPolygon).") + return gdf + except Exception as e: + print(f"[WARN] Gagal parsing kolom geometry sebagai WKT: {e}") + + + + if 
master_polygons is not None: + df.columns = df.columns.str.lower().str.strip().str.replace(' ', '_').str.replace('/', '_') + matches = find_admin_column(df, COLUMN_ALIASES) + + if 'desa' in matches: + admin_col = matches['desa'] + merged = df.merge(master_polygons, left_on=admin_col, right_on='nama_desa', how='left') + if merged['geometry'].isna().sum() > 0: + merged = fuzzy_merge(df, master_polygons, admin_col, 'nama_desa') + gdf = gpd.GeoDataFrame(merged, geometry='geometry', crs=master_polygons.crs) + return gdf + + elif 'kecamatan' in matches: + admin_col = matches['kecamatan'] + merged = df.merge(master_polygons, left_on=admin_col, right_on='nama_kecamatan', how='left') + gdf = gpd.GeoDataFrame(merged, geometry='geometry', crs=master_polygons.crs) + return gdf + + elif 'kabupaten' in matches: + admin_col = matches['kabupaten'] + merged = df.merge(master_polygons, left_on=admin_col, right_on='nama_kabupaten', how='left') + gdf = gpd.GeoDataFrame(merged, geometry='geometry', crs=master_polygons.crs) + return gdf + + print("[WARN] Tidak ditemukan geometry (lat/lon, path, atau master).") + return df + + +def get_reference_polygons(level): + """Mengambil data batas wilayah (MultiPolygon) dari DB referensi""" + table_map = { + 'desa': f"{REFERENCE_SCHEMA}.administrasi_ar_keldesa_jatim", + 'kecamatan': f"{REFERENCE_SCHEMA}.administrasi_ar_kec_jatim", + 'kabupaten': f"{REFERENCE_SCHEMA}.administrasi_ar_kabkot_jatim" + } + + table_name = table_map.get(level) + if not table_name: + raise ValueError(f"Tidak ada tabel referensi untuk level '{level}'.") + + engine = create_engine(REFERENCE_DB_URL) + query = f"SELECT *, ST_Multi(geom) AS geometry FROM {table_name}" + gdf = gpd.read_postgis(query, engine, geom_col='geometry') + + print(f"[INFO] {len(gdf)} data referensi '{level}' berhasil dimuat dari {table_name}.") + return gdf + + +# ============================================================ +# FUNGSI: AUTO ATTACH POLYGON KE DATAFRAME NON-SPASIAL +# 
============================================================ +def attach_polygon_geometry_auto(df: pd.DataFrame): + """ + Tambahkan kolom geometry MultiPolygon berdasarkan kombinasi + (desa/kelurahan + kecamatan + kabupaten/kota), tanpa duplikasi baris. + """ + level = detect_smallest_admin_level(df) + if not level: + print("[WARN] Tidak ditemukan kolom administratif (desa/kecamatan/kabupaten).") + return df + + print(f"[INFO] Detected smallest admin level: {level}") + ref_gdf = get_reference_polygons(level) + + desa_col = next((c for c in df.columns if any(x in c.lower() for x in ['desa', 'kelurahan'])), None) + kec_col = next((c for c in df.columns if 'kec' in c.lower()), None) + kab_col = next((c for c in df.columns if any(x in c.lower() for x in ['kab', 'kota'])), None) + + if desa_col and (not kec_col or not kab_col): + print("[ERROR] Kolom 'Desa' ditemukan tetapi kolom 'Kecamatan' dan/atau 'Kabupaten' tidak lengkap.") + print(f"[DEBUG] Ditemukan: Desa={desa_col}, Kec={kec_col}, Kab={kab_col}") + return df + + elif not desa_col and kec_col and not kab_col: + print("[ERROR] Kolom 'Kecamatan' ditemukan tetapi kolom 'Kabupaten/Kota' tidak ditemukan.") + print(f"[DEBUG] Ditemukan: Desa={desa_col}, Kec={kec_col}, Kab={kab_col}") + return df + + elif kab_col and not desa_col and not kec_col : + print("[INFO] Struktur kolom administratif valid (minimal Kabupaten/Kota ditemukan).") + print(f"[DEBUG] Ditemukan: Desa={desa_col}, Kec={kec_col}, Kab={kab_col}") + + elif not desa_col and not kec_col and not kab_col: + print("[WARN] Tidak ditemukan kolom administratif apapun (Desa/Kecamatan/Kabupaten).") + print(f"[DEBUG] Kolom CSV: {list(df.columns)}") + return df + + # kolom di referensi + desa_ref = "WADMKD" + kec_ref = "WADMKC" + kab_ref = "WADMKK" + + if desa_col is not None: + df[desa_col] = df[desa_col].astype(str).apply(lambda x: normalize_name(x, "desa")) + + if kec_col is not None: + df[kec_col] = df[kec_col].astype(str).apply(lambda x: normalize_name(x, 
"kecamatan")) + + if kab_col is not None: + df[kab_col] = df[kab_col].astype(str).apply(lambda x: normalize_name(x, "kabupaten")) + + + if desa_ref is not None: + ref_gdf[desa_ref] = ref_gdf[desa_ref].astype(str).apply(lambda x: normalize_name(x, "desa")) + + if kec_ref is not None: + ref_gdf[kec_ref] = ref_gdf[kec_ref].astype(str).apply(lambda x: normalize_name(x, "kecamatan")) + + if kab_ref is not None: + ref_gdf[kab_ref] = ref_gdf[kab_ref].astype(str).apply(lambda x: normalize_name(x, "kabupaten")) + + + + + join_cols = [col for col in [desa_col, kec_col, kab_col] if col] + + if not join_cols: + print("[ERROR] Tidak ada kolom administratif yang bisa digunakan untuk join key.") + else: + join_cols_df = [col for col in [desa_col, kec_col, kab_col] if col] + join_cols_ref = [col for col in [desa_ref, kec_ref, kab_ref] if col] + + common_depth = min(len(join_cols_df), len(join_cols_ref)) + join_cols_df = join_cols_df[-common_depth:] + join_cols_ref = join_cols_ref[-common_depth:] + + # print(f"[DEBUG] Join kolom DF : {join_cols_df}") + # print(f"[DEBUG] Join kolom REF : {join_cols_ref}") + + df["_join_key"] = df[join_cols_df].astype(str).agg("|".join, axis=1) + ref_gdf["_join_key"] = ref_gdf[join_cols_ref].astype(str).agg("|".join, axis=1) + + # print(f"[INFO] Join key berhasil dibuat dari kolom: {join_cols_df}") + + ref_lookup = ref_gdf[["_join_key", "geometry"]].drop_duplicates(subset=["_join_key"]) + df = df.merge(ref_lookup, how="left", on="_join_key") + matched = df["geometry"].notna().sum() + # print(f"[INFO] {matched} dari {len(df)} baris cocok langsung berdasarkan (desa + kec + kab/kota).") + + if matched < len(df): + unmatched = df[df["geometry"].isna()] + # print(f"[INFO] Melakukan fuzzy match untuk {len(unmatched)} baris yang belum cocok...") + + ref_dict = dict(zip(ref_lookup["_join_key"], ref_lookup["geometry"])) + + def find_fuzzy_geom(row): + key = row["_join_key"] + if not isinstance(key, str): + return None + # fuzzy old + # match = 
process.extractOne(key, list(ref_dict.keys()), scorer=fuzz.token_sort_ratio) + # fuzzy new + match = process.extractOne( + key, list(ref_dict.keys()), scorer=fuzz.token_set_ratio, score_cutoff=80 + ) + + if match and match[1] >= 85: + return ref_dict[match[0]] + return None + + df.loc[df["geometry"].isna(), "geometry"] = df[df["geometry"].isna()].apply(find_fuzzy_geom, axis=1) + + df = df.drop(columns=["_join_key"], errors="ignore") + + # admin_cols = [col for col in [desa_col, kec_col, kab_col] if col and col in df.columns] + # if matched < len(df): + # diff = df[df['geometry'].isna()][admin_cols] + + # print("[DEBUG] Baris yang tidak match:") + # if diff.empty: + # print("(semua baris berhasil match)") + # else: + # print(diff.to_string(index=False)) + + + # print(f"[REPORT] Total match: {df['geometry'].notna().sum()} / {len(df)} ({df['geometry'].notna().mean()*100:.2f}%)") + + + return gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326") diff --git a/services/reader_csv.py b/services/reader_csv.py new file mode 100644 index 0000000..7cb5516 --- /dev/null +++ b/services/reader_csv.py @@ -0,0 +1,59 @@ +# import pandas as pd + +# def read_csv(path: str): +# df = pd.read_csv(path) +# df.columns = [c.strip() for c in df.columns] + +# return df + + +# services/reader_csv.py +import pandas as pd +import re + +def detect_header_line(path, max_rows=10): + """ + Mendeteksi baris header (nama kolom) di CSV. + Mengembalikan index baris header (0-based). 
+ """ + with open(path, 'r', encoding='utf-8', errors='ignore') as f: + lines = [next(f) for _ in range(max_rows)] + + header_line_idx = 0 + best_score = -1 + + for i, line in enumerate(lines): + # Pisahkan berdasarkan koma / titik koma / tab + cells = re.split(r'[;,|\t]', line.strip()) + # Heuristik: jika banyak huruf & sedikit angka → kemungkinan header + alpha_ratio = sum(bool(re.search(r'[A-Za-z]', c)) for c in cells) / max(len(cells), 1) + digit_ratio = sum(bool(re.search(r'\d', c)) for c in cells) / max(len(cells), 1) + score = alpha_ratio - digit_ratio # makin tinggi makin mirip header + + if score > best_score: + best_score = score + header_line_idx = i + + return header_line_idx + + +def read_csv(path: str): + """ + Membaca CSV dengan deteksi otomatis baris header. + """ + try: + header_line = detect_header_line(path) + print(f"[INFO] Detected header line: {header_line + 1}") + df = pd.read_csv(path, header=header_line, encoding='utf-8', low_memory=False) + except Exception as e: + print(f"[WARN] Gagal deteksi header otomatis: {e}, fallback ke baris pertama") + df = pd.read_csv(path, encoding='utf-8', low_memory=False) + + # Bersihkan kolom kosong / unnamed + df = df.loc[:, ~df.columns.str.contains('^Unnamed')] + df.columns = [str(c).strip() for c in df.columns] + + # Hapus baris kosong total + df = df.dropna(how='all') + + return df diff --git a/services/reader_gdb.py b/services/reader_gdb.py new file mode 100644 index 0000000..843f2d5 --- /dev/null +++ b/services/reader_gdb.py @@ -0,0 +1,75 @@ +import geopandas as gpd +import fiona +import zipfile +import tempfile +import os +import shutil + +def read_gdb(zip_path: str, layer: str = None): + if not zip_path.lower().endswith(".zip"): + raise ValueError("File GDB harus berupa ZIP yang berisi folder .gdb atau file .gdbtable") + + tmpdir = tempfile.mkdtemp() + with zipfile.ZipFile(zip_path, "r") as zip_ref: + zip_ref.extractall(tmpdir) + + macosx_path = os.path.join(tmpdir, "__MACOSX") + if 
os.path.exists(macosx_path): + shutil.rmtree(macosx_path) + + gdb_folders = [] + for root, dirs, _ in os.walk(tmpdir): + for d in dirs: + if d.lower().endswith(".gdb"): + gdb_folders.append(os.path.join(root, d)) + + if not gdb_folders: + gdbtable_files = [] + for root, _, files in os.walk(tmpdir): + for f in files: + if f.lower().endswith(".gdbtable"): + gdbtable_files.append(os.path.join(root, f)) + + if gdbtable_files: + first_folder = os.path.dirname(gdbtable_files[0]) + base_name = os.path.basename(first_folder) + gdb_folder_path = os.path.join(tmpdir, f"{base_name}.gdb") + + os.makedirs(gdb_folder_path, exist_ok=True) + + for fpath in os.listdir(first_folder): + if ".gdb" in fpath.lower(): + shutil.move(os.path.join(first_folder, fpath), os.path.join(gdb_folder_path, fpath)) + + gdb_folders.append(gdb_folder_path) + # print(f"[INFO] Rebuilt GDB folder from nested structure: {gdb_folder_path}") + else: + # print("[DEBUG] Isi ZIP:", os.listdir(tmpdir)) + shutil.rmtree(tmpdir) + raise ValueError("Tidak ditemukan folder .gdb atau file .gdbtable di dalam ZIP") + + gdb_path = gdb_folders[0] + + layers = fiona.listlayers(gdb_path) + # print(f"[INFO] Layer tersedia: {layers}") + + chosen_layer = layer or (layers[0] if layers else None) + if not chosen_layer: + shutil.rmtree(tmpdir) + raise ValueError("Tidak ada layer GDB yang bisa dibaca.") + + print(f"[DEBUG] Membaca layer: {chosen_layer}") + + try: + gdf = gpd.read_file(gdb_path, layer=chosen_layer) + except Exception as e: + shutil.rmtree(tmpdir) + raise ValueError(f"Gagal membaca layer dari GDB: {e}") + + if gdf.crs is None: + # print("[WARN] CRS tidak terdeteksi, diasumsikan EPSG:4326") + gdf.set_crs("EPSG:4326", inplace=True) + + + shutil.rmtree(tmpdir) + return gdf diff --git a/services/reader_pdf.py b/services/reader_pdf.py new file mode 100644 index 0000000..a5a5241 --- /dev/null +++ b/services/reader_pdf.py @@ -0,0 +1,250 @@ +import pdfplumber +import re +import pandas as pd + +def is_number(s): + if s is 
# --- services/reader_pdf.py: table-structure heuristics ----------------------

def is_number(s):
    """True if *s* looks like an integer, ignoring '.'/',' group separators."""
    if s is None:
        return False
    s = str(s).strip().replace(',', '').replace('.', '')
    return s.isdigit()


def row_ratio(row):
    """Fraction of non-empty cells in *row* that are numeric (0 for empty rows)."""
    non_empty = [c for c in row if c not in (None, '', ' ')]
    if not non_empty:
        return 0
    num_count = sum(is_number(c) for c in non_empty)
    return num_count / len(non_empty)


def has_mixed_text_and_numbers(row):
    """True when *row* holds at least one textual and one numeric cell —
    a strong sign of a data (body) row rather than a header."""
    non_empty = [c for c in row if c not in (None, '', ' ')]
    has_text = any(isinstance(c, str) and re.search(r'[A-Za-z]', str(c)) for c in non_empty)
    has_num = any(is_number(c) for c in non_empty)
    return has_text and has_num


def is_short_text_row(row):
    """Detect a short text-only row (<= 2 cells, joined length < 20) —
    typically a stray caption/title fragment, not a real header row."""
    non_empty = [str(c).strip() for c in row if c not in (None, '', ' ')]
    if not non_empty:
        return False
    text_only = all(not is_number(c) for c in non_empty)
    joined = " ".join(non_empty)
    return text_only and len(non_empty) <= 2 and len(joined) < 20


def detect_header_rows(rows):
    """Split an extracted table into ``(header_rows, body_rows)``.

    The body is assumed to start at the first row that mixes text and
    numbers, is mostly numeric (> 0.3), contains a pure-digit cell, or
    follows a fully non-numeric row with a numeric one. Short caption-like
    rows are dropped from the header unless followed by another text row.

    Always returns a pair; the original returned a bare ``[]`` for empty
    input, which crashed callers that tuple-unpack the result.
    """
    if not rows:
        return [], []

    ratios = [row_ratio(r) for r in rows]
    body_start_index = None

    for i in range(1, len(rows)):
        row = rows[i]
        if has_mixed_text_and_numbers(row):
            body_start_index = i
            break
        if ratios[i] > 0.3:
            body_start_index = i
            break
        if any(isinstance(c, str) and re.match(r'^\d+$', c.strip()) for c in row):
            body_start_index = i
            break
        if ratios[i - 1] == 0 and ratios[i] > 0:
            body_start_index = i
            break

    if body_start_index is None:
        # No body detected: treat everything as header.
        body_start_index = len(rows)

    potential_headers = rows[:body_start_index]
    body_filtered = rows[body_start_index:]

    header_filtered = []
    for idx, row in enumerate(potential_headers):
        if is_short_text_row(row):
            # Keep a short text row only when another all-text row follows it
            # (multi-line header); otherwise drop it as a caption.
            if idx + 1 < len(potential_headers) and ratios[idx + 1] == 0:
                header_filtered.append(row)
            else:
                continue
        else:
            header_filtered.append(row)

    return header_filtered, body_filtered


def merge_multiline_header(header_rows):
    """Collapse stacked header rows into one list of column names.

    For each column, the last non-blank cell (bottom-most line) wins;
    embedded newlines become spaces and empty results are dropped.
    """
    final_header = []
    for col in zip(*header_rows):
        val = next((v for v in reversed(col) if v and str(v).strip()), '')
        val = str(val).replace('\n', ' ').strip()
        final_header.append(val)
    final_header = [v for v in final_header if v not in ['', None]]
    return final_header


# Known spellings (including OCR-style '0'-for-'O' typos) of a row-number
# column header. Runtime data — values preserved exactly.
NUMBER_HEADER_KEYWORDS = [
    "no", "no.", "no .", "no . ", "no :", "no : ", "nomor", "nomor.", "nomor :",
    "nomor urut", "no urut", "no. urut", "no-urut", "no_urut", "nomor_urut",
    "nomor-urut", "No", "NO", "NO.", "No.", "No :", "NO :", "Nomor", "NOMOR",
    "Nomor Urut", "NOMOR URUT", "No Urut", "NO URUT", "No. Urut", "NO. URUT",
    "No /", "No / ", "No / Nama", "No -", "No - ", "Nomor /", "Nomor -",
    "Number", "No. of", "No of", "Index", "Serial", "Order", "ID", "ID No",
    "ID No.", "Sr No", "Sr. No", "S/N", "SN", "Sl No", "Sl. No", "N0", "N0.",
    "N0 :", "NOM0R", "NOM0R URUT", "N0MOR",
]


def has_number_header(header):
    """Check whether *header* mentions a No/Nomor-style column.

    NOTE(review): *header* is a string (substring match) when called from
    cleaning_column()'s actual calling convention, but `in` also works for a
    list of cells (exact-cell match) — confirm intended usage.
    """
    header_text = header
    return any(keyword in header_text for keyword in NUMBER_HEADER_KEYWORDS)


def is_numbering_column(col_values):
    """True when > 60% of the string values look like row numbers (1, 01, 002…)."""
    numeric_like = 0
    total = 0
    for v in col_values:
        if not v or not isinstance(v, str):
            continue
        total += 1
        if re.fullmatch(r"0*\d{1,3}", v.strip()):
            numeric_like += 1
    return total > 0 and (numeric_like / total) > 0.6


def is_numeric_value(v):
    """True when *v* is a number or a (<= 3 digit, optionally zero-padded) numeric string."""
    if v is None:
        return False
    if isinstance(v, (int, float)):
        return True
    if isinstance(v, str) and re.fullmatch(r"0*\d{1,3}", v.strip()):
        return True
    return False


def cleaning_column(headers, bodies):
    """Strip an implicit row-number column and drop malformed rows.

    NOTE(review): despite the plural names, read_pdf() calls this with
    ``headers`` = ONE header (the list of column names) and ``bodies`` =
    ``[body]``, so the zip below runs exactly once with ``header`` bound to
    the first column name and ``len(headers)`` equal to the column count.
    Preserved as-is; confirm before reusing in true batch form.
    """
    cleaned_bodies = []

    for header, body in zip(headers, bodies):
        if not body:
            cleaned_bodies.append(body)
            continue

        header_has_number = has_number_header(header)
        first_col = [row[0] for row in body if row and len(row) > 0]
        first_col_is_numbering = is_numbering_column(first_col)

        # Header does not declare a "No" column but the first column is a
        # running number: drop that leading cell from every row.
        if not header_has_number and first_col_is_numbering:
            new_body = []
            for row in body:
                if not row:
                    continue
                first_val = row[0]
                if is_numeric_value(first_val) and len(row) > 1:
                    new_body.append(row[1:])
                else:
                    new_body.append(row)
            body = new_body

        # Keep only rows whose width matches the column count (see NOTE above:
        # len(headers) is the number of columns under the actual call shape).
        header_len = len(headers)
        filtered_body = [row for row in body if len(row) == header_len]

        cleaned_bodies.append(filtered_body)

    return cleaned_bodies
def cleaning_column(headers, bodies):
    """Strip an implicit row-number column and drop malformed rows.

    NOTE(review): despite the plural names, read_pdf() calls this with
    ``headers`` = ONE header (the list of column names) and ``bodies`` =
    ``[body]``, so the zip below runs exactly once with ``header`` bound to
    the first column name and ``len(headers)`` equal to the column count.
    Preserved as-is; confirm before reusing in true batch form.
    """
    cleaned_bodies = []

    for header, body in zip(headers, bodies):
        if not body:
            cleaned_bodies.append(body)
            continue

        header_has_number = has_number_header(header)
        first_col = [row[0] for row in body if row and len(row) > 0]
        first_col_is_numbering = is_numbering_column(first_col)

        # Header does not declare a "No" column but the first column is a
        # running number: drop that leading cell from every row.
        if not header_has_number and first_col_is_numbering:
            new_body = []
            for row in body:
                if not row:
                    continue
                first_val = row[0]
                if is_numeric_value(first_val) and len(row) > 1:
                    new_body.append(row[1:])
                else:
                    new_body.append(row)
            body = new_body

        # Keep only rows whose width matches the column count (see NOTE above:
        # len(headers) is the number of columns under the actual call shape).
        header_len = len(headers)
        filtered_body = [row for row in body if len(row) == header_len]

        cleaned_bodies.append(filtered_body)

    return cleaned_bodies


def read_pdf(path: str):
    """Extract tables from a PDF and return a list of parsed-table dicts.

    Each dict has ``title`` (1-based table index as a string), ``columns``
    (merged header) and ``rows`` (cleaned body). Tables with 4 rows or fewer
    are discarded as noise.

    NOTE(review): only the FIRST page (``pdf.pages[0]``) is scanned —
    presumably intentional for the current inputs; confirm whether multi-page
    documents should be supported.
    """
    pdf_path = path
    tables_data = []
    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[0]
        tables = page.find_tables()
        for i, t in enumerate(tables, start=1):
            table = t.extract()
            if len(table) > 4:
                tables_data.append(table)

    print(f"\nTotal tabel valid: {len(tables_data)}\n")

    # Split every table into header rows and body rows.
    header_only = []
    body_only = []
    for tbl in tables_data:
        head, body = detect_header_rows(tbl)
        header_only.append(head)
        body_only.append(body)

    clean_header = [merge_multiline_header(h) for h in header_only]

    clean_body = []
    for i, raw_body in enumerate(body_only):
        # Drop empty cells, then strip stray numbering columns / bad rows.
        con_body = [[cell for cell in row if cell not in (None, '')] for row in raw_body]
        cleaned = cleaning_column(clean_header[i], [con_body])
        clean_body.append(cleaned[0])

    parsed = []
    for i, (cols, rows) in enumerate(zip(clean_header, clean_body), start=1):
        parsed.append({
            "title": str(i),
            "columns": cols,
            "rows": rows,
        })

    return parsed


def convert_df(payload):
    """Convert a parsed-table dict (``columns``/``rows``/optional ``title``)
    into a pandas DataFrame.

    Raises ValueError when keys are missing or a row's width does not match
    the column count, TypeError when ``columns``/``rows`` are not lists.
    The title, when present, is attached as ``df.attrs["title"]``.
    """
    if "columns" not in payload or "rows" not in payload:
        raise ValueError("Payload tidak memiliki key 'columns' atau 'rows'.")

    if not isinstance(payload["columns"], list):
        raise TypeError("'columns' harus berupa list.")
    if not isinstance(payload["rows"], list):
        raise TypeError("'rows' harus berupa list.")

    for i, row in enumerate(payload["rows"]):
        if len(row) != len(payload["columns"]):
            raise ValueError(f"Jumlah elemen di baris ke-{i} tidak sesuai jumlah kolom.")

    df = pd.DataFrame(payload["rows"], columns=payload["columns"])

    if "title" in payload:
        df.attrs["title"] = payload["title"]

    return df
def convert_df(payload):
    """Convert a parsed-table dict (``columns``/``rows``/optional ``title``)
    into a pandas DataFrame.

    Raises ValueError when keys are missing or a row's width does not match
    the column count, TypeError when ``columns``/``rows`` are not lists.
    The title, when present, is attached as ``df.attrs["title"]``.
    """
    if "columns" not in payload or "rows" not in payload:
        raise ValueError("Payload tidak memiliki key 'columns' atau 'rows'.")

    if not isinstance(payload["columns"], list):
        raise TypeError("'columns' harus berupa list.")
    if not isinstance(payload["rows"], list):
        raise TypeError("'rows' harus berupa list.")

    for i, row in enumerate(payload["rows"]):
        if len(row) != len(payload["columns"]):
            raise ValueError(f"Jumlah elemen di baris ke-{i} tidak sesuai jumlah kolom.")

    df = pd.DataFrame(payload["rows"], columns=payload["columns"])

    if "title" in payload:
        df.attrs["title"] = payload["title"]

    return df


def test_read_pdf():
    """Return a canned two-table ``read_pdf``-style payload for testing.

    The fixture mirrors real extractor output (two East Java statistics
    tables). The original contained invalid ``\\/`` escape sequences (JSON
    paste residue, a SyntaxWarning on modern Python) — replaced with plain
    ``/`` — and a second, commented-out single-table fixture, now removed.
    """
    parsed = [
        {
            "title": "Luas Catchment Area (km2) Pada Wilayah Sungai di Provinsi Jawa Timur",
            "columns": ["Wilayah Sungai", "Luas (km2)", "Jumlah DAS"],
            "rows": [
                ["Bengawan Solo", "13.070,00", "94 DAS"],
                ["Brantas", "13.880,00", "20 DAS"],
                ["Welang -Rejoso", "2.601,00", "36 DAS"],
                ["Pekalen -Sampean", "3.953,00", "56 DAS"],
                ["Baru -Bajulmati", "3.675,00", "60 DAS"],
                ["Bondoyudo -Bedadung", "5.364,00", "47 DAS"],
                ["Madura", "4.575,00", "173 DAS"],
            ],
        },
        {
            "title": "Jumlah dan Kepadatan Penduduk Menurut Kabupaten/kota di Provinsi Jawa Timur Tahun 2021",
            "columns": ["Kabupaten/Kota", "Jumlah Penduduk", "Persentase", "Kepadatan Penduduk (Jiwa per Km2)"],
            "rows": [
                ["Bangkalan", "1.082.759", "2,64", "1.081,20"],
                ["Banyuwangi", "1.749.773", "4,27", "302,60"],
                ["Blitar", "1.228.292", "3,00", "919,05"],
                ["Bojonegoro", "1.343.895", "3,28", "611,20"],
                ["Bondowoso", "801.541", "1,96", "525,27"],
                ["Gresik", "1.283.961", "3,13", "1.077,83"],
                ["Jember", "2.581.486", "6,30", "834,80"],
                ["Jombang", "1.350.483", "3,29", "1.211,10"],
                ["Kediri", "1.671.821", "4,08", "1.206,18"],
                ["Lamongan", "1.379.731", "3,37", "774,24"],
                ["Lumajang", "1.091.856", "2,66", "609,67"],
                ["Madiun", "754.263", "1,84", "726,94"],
                ["Magetan", "689.369", "1,68", "1.000,77"],
                ["Malang", "2.611.907", "6,37", "739,78"],
                ["Mojokerto", "1.126.540", "2,75", "1.569,37"],
                ["Nganjuk", "1.133.556", "2,77", "925,92"],
                ["Ngawi", "896.768", "2,19", "691,96"],
                ["Pacitan", "597.580", "1,46", "429,94"],
                ["Pamekasan", "840.790", "2,05", "1.061,28"],
                ["Pasuruan", "1.603.754", "3,91", "1.088,01"],
                ["Ponorogo", "968.681", "2,36", "741,89"],
                ["Probolinggo", "1.156.570", "2,82", "681,86"],
                ["Sampang", "902.514", "2,20", "731,92"],
                ["Sidoarjo", "1.951.723", "4,76", "3.076,58"],
                ["Situbondo", "666.245", "1,63", "398,98"],
                ["Sumenep", "1.134.750", "2,77", "567,79"],
                ["Trenggalek", "746.734", "1,82", "650,91"],
                ["Tuban", "1.223.257", "2,98", "666,93"],
                ["Tulungagung", "1.126.679", "2,75", "1.067,28"],
                ["Kota Batu", "215.248", "0,53", "1.574,14"],
                ["Kota Blitar", "158.123", "0,39", "4.854,87"],
                ["Kota Kediri", "292.363", "0,71", "4.611,40"],
                ["Kota Madiun", "201.243", "0,49", "6.045,15"],
                ["Kota Malang", "866.356", "2,11", "5.963,35"],
                ["Kota Mojokerto", "139.961", "0,34", "8.497,94"],
                ["Kota Pasuruan", "210.341", "0,51", "5.960,36"],
                ["Kota Probolinggo", "242.246", "0,59", "4.274,68"],
                ["Kota Surabaya", "2.970.843", "7,25", "8.475,05"],
                ["Provinsi Jawa Timur", "40.994.002", "100,00", "76.228,17"],
            ],
        },
    ]
    return parsed
# --- services/reader_shp.py --------------------------------------------------
import os
import shutil
import tempfile
import zipfile


def read_shp(path: str):
    """Read a shapefile — a bare ``.shp`` path or a ZIP containing one —
    into a GeoDataFrame.

    When the resulting geometry column is missing/empty, the features are
    rebuilt from the raw fiona records. A missing CRS is assumed to be
    EPSG:4326 (NOTE(review): confirm upstream data really is WGS84).

    Raises ValueError for an empty path, a ZIP without a ``.shp``, or a
    read failure.
    """
    if not path:
        raise ValueError("Path shapefile tidak boleh kosong.")

    tmpdir = None
    try:
        if path.lower().endswith(".zip"):
            tmpdir = tempfile.mkdtemp()
            with zipfile.ZipFile(path, "r") as zip_ref:
                zip_ref.extractall(tmpdir)

            shp_files = []
            for root, _, files in os.walk(tmpdir):
                for f in files:
                    if f.lower().endswith(".shp"):
                        shp_files.append(os.path.join(root, f))

            if not shp_files:
                raise ValueError("Tidak ditemukan file .shp di dalam ZIP.")
            # First .shp wins when the archive bundles several.
            shp_path = shp_files[0]
            print(f"[DEBUG] Membaca shapefile: {os.path.basename(shp_path)}")
        else:
            shp_path = path

        # Heavy geo dependencies imported lazily, after the cheap validation
        # above, so importing this module does not require them.
        import fiona
        import geopandas as gpd
        from shapely.geometry import shape

        try:
            gdf = gpd.read_file(shp_path)
        except Exception as e:
            raise ValueError(f"Gagal membaca shapefile: {e}")

        if "geometry" not in gdf.columns or gdf.geometry.is_empty.all():
            print("[WARN] Geometry kosong. Mencoba membangun ulang dari fitur mentah...")

            with fiona.open(shp_path) as src:
                features = []
                for feat in src:
                    geom = shape(feat["geometry"]) if feat["geometry"] else None
                    props = feat["properties"]
                    props["geometry"] = geom
                    features.append(props)

                gdf = gpd.GeoDataFrame(features, geometry="geometry", crs=src.crs)

        if gdf.crs is None:
            gdf.set_crs("EPSG:4326", inplace=True)

        return gdf
    finally:
        # The original leaked tmpdir on every error path that raised before
        # its final cleanup (missing .shp, read failure); always clean up.
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir, ignore_errors=True)