Compare commits

..

No commits in common. "52770c1bce170d01fe754779836d1e3af2a65cc3" and "c953ae7675ca6097a2f62b4a1dd97558695d3251" have entirely different histories.

3 changed files with 41 additions and 138 deletions

13
main.py
View File

@ -17,7 +17,6 @@ from services.geometry_detector import attach_polygon_geometry_auto
from database.connection import engine from database.connection import engine
from database.models import Base from database.models import Base
import time import time
from datetime import datetime, timedelta
import pathlib import pathlib
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
@ -29,7 +28,7 @@ from sqlalchemy import text
UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True) UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
apiVersion = "2.1.3" apiVersion = "2.1.0"
app = FastAPI( app = FastAPI(
title="ETL Geo Upload Service", title="ETL Geo Upload Service",
version=apiVersion, version=apiVersion,
@ -204,19 +203,17 @@ def process_data(df: pd.DataFrame, ext: str):
from datetime import datetime from datetime import datetime
@app.get("/status", tags=["System"]) @app.get("/status", tags=["System"])
async def server_status(): async def server_status():
utc_time = datetime.utcnow()
wib_time = utc_time + timedelta(hours=7)
formatted_time = wib_time.strftime("%d-%m-%Y %H:%M:%S")
response = { response = {
"status": "success", "status": "success",
"message": "Server is running smoothly ✅", "message": "Server is running smoothly ✅",
"data": { "data": {
"service": "upload_automation", "service": "upload_automation",
"status_code": 200, "status_code": 200,
"timestamp": f"{formatted_time} WIB" "timestamp": datetime.utcnow().isoformat() + "Z",
}, },
"meta": { "meta": {
"version": apiVersion, "version": apiVersion,
@ -228,7 +225,7 @@ async def server_status():
@app.post("/upload") @app.post("/upload")
async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form(""), sheet: Optional[str] = Form("")): async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form("")):
fname = file.filename fname = file.filename
ext = os.path.splitext(fname)[1].lower() ext = os.path.splitext(fname)[1].lower()
contents = await file.read() contents = await file.read()
@ -245,7 +242,7 @@ async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form("
print('ext', ext) print('ext', ext)
if ext == ".csv": if ext == ".csv":
df = read_csv(str(tmp_path), sheet) df = read_csv(str(tmp_path))
elif ext == ".xlsx": elif ext == ".xlsx":
df = read_csv(str(tmp_path)) df = read_csv(str(tmp_path))
elif ext == ".pdf": elif ext == ".pdf":

View File

@ -69,8 +69,6 @@ def normalize_name(name: str, level: str = None):
if not name: if not name:
return None return None
name = re.sub(r'\s*\([^)]*\)\s*', '', name)
raw = name.lower() raw = name.lower()
raw = re.sub(r'^(desa|kelurahan|kel|dusun|kampung)\s+', '', raw) raw = re.sub(r'^(desa|kelurahan|kel|dusun|kampung)\s+', '', raw)
raw = re.sub(r'^(kecamatan|kec)\s+', '', raw) raw = re.sub(r'^(kecamatan|kec)\s+', '', raw)
@ -119,6 +117,7 @@ def normalize_name(name: str, level: str = None):
def is_geom_empty(g): def is_geom_empty(g):
"""True jika geometry None, NaN, atau geometry Shapely kosong.""" """True jika geometry None, NaN, atau geometry Shapely kosong."""
if g is None: if g is None:
@ -135,7 +134,7 @@ def is_geom_empty(g):
import math import math
def normalize_lon(val, is_lat=False): def normalize_dynamic(val, is_lat=False):
if pd.isna(val): if pd.isna(val):
return None return None
try: try:
@ -195,7 +194,7 @@ def detect_and_build_geometry(df: pd.DataFrame, master_polygons: gpd.GeoDataFram
df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce') df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce')
df[lon_col] = pd.to_numeric(df[lon_col], errors='coerce') df[lon_col] = pd.to_numeric(df[lon_col], errors='coerce')
df[lon_col] = df[lon_col].apply(lambda x: normalize_lon(x, is_lat=False)) df[lon_col] = df[lon_col].apply(lambda x: normalize_dynamic(x, is_lat=False))
df[lat_col] = df[lat_col].apply(normalize_lat) df[lat_col] = df[lat_col].apply(normalize_lat)
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df[lon_col], df[lat_col]), crs="EPSG:4326") gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df[lon_col], df[lat_col]), crs="EPSG:4326")

View File

@ -117,139 +117,47 @@ def detect_delimiter(path, sample_size=2048):
return delim return delim
return ',' return ','
# def read_csv(path: str): def read_csv(path: str):
# ext = os.path.splitext(path)[1].lower()
# try:
# if ext in ['.csv']:
# # === Baca file CSV ===
# header_line = detect_header_line(path)
# delimiter = detect_delimiter(path)
# print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
# df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',')
# elif ext in ['.xlsx', '.xls']:
# # === Baca file Excel ===
# print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
# xls = pd.ExcelFile(path)
# print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")
# # Evaluasi tiap sheet untuk mencari yang paling relevan
# best_sheet = None
# best_score = -1
# best_df = None
# for sheet_name in xls.sheet_names:
# try:
# df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
# df = df.dropna(how='all').dropna(axis=1, how='all')
# if len(df) == 0 or len(df.columns) < 2:
# continue
# # hitung "skor relevansi"
# text_ratio = df.applymap(lambda x: isinstance(x, str)).sum().sum() / (df.size or 1)
# row_score = len(df)
# score = (row_score * 0.7) + (text_ratio * 100)
# if score > best_score:
# best_score = score
# best_sheet = sheet_name
# best_df = df
# except Exception as e:
# print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
# continue
# if best_df is not None:
# print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
# df = best_df
# else:
# raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
# # Konversi tipe numerik jika ada
# for col in df.columns:
# if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
# df[col] = df[col].astype(str).str.replace(',', '', regex=False)
# df[col] = pd.to_numeric(df[col], errors='ignore')
# else:
# raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)")
# except Exception as e:
# print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
# df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
# # Bersihkan kolom dan baris kosong
# df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
# df.columns = [str(c).strip() for c in df.columns]
# df = df.dropna(how='all')
# return df
def read_csv(path: str, sheet: str = None):
ext = os.path.splitext(path)[1].lower() ext = os.path.splitext(path)[1].lower()
try: try:
if ext in ['.csv']: if ext in ['.csv', '.txt']:
# === Baca file CSV === # === Baca file CSV ===
header_line = detect_header_line(path) header_line = detect_header_line(path)
delimiter = detect_delimiter(path) delimiter = detect_delimiter(path)
print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'") print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
df = pd.read_csv( df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',')
path,
header=header_line,
sep=delimiter,
encoding='utf-8',
low_memory=False,
thousands=','
)
elif ext in ['.xlsx', '.xls']: elif ext in ['.xlsx', '.xls']:
# === Baca file Excel === # === Baca file Excel ===
print(f"[INFO] Membaca file Excel: {os.path.basename(path)}") print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
xls = pd.ExcelFile(path) xls = pd.ExcelFile(path)
print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}") print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")
# === Jika user memberikan nama sheet === # Evaluasi tiap sheet untuk mencari yang paling relevan
if sheet:
if sheet not in xls.sheet_names:
raise ValueError(f"Sheet '{sheet}' tidak ditemukan dalam file {os.path.basename(path)}")
print(f"[INFO] Membaca sheet yang ditentukan: '{sheet}'")
df = pd.read_excel(xls, sheet_name=sheet, header=0, dtype=str)
df = df.dropna(how='all').dropna(axis=1, how='all')
else:
# === Auto-detect sheet terbaik ===
print("[INFO] Tidak ada sheet yang ditentukan, mencari sheet paling relevan...")
best_sheet = None best_sheet = None
best_score = -1 best_score = -1
best_df = None best_df = None
for sheet_name in xls.sheet_names: for sheet_name in xls.sheet_names:
try: try:
temp_df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str) df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
temp_df = temp_df.dropna(how='all').dropna(axis=1, how='all') df = df.dropna(how='all').dropna(axis=1, how='all')
if len(temp_df) == 0 or len(temp_df.columns) < 2: if len(df) == 0 or len(df.columns) < 2:
continue continue
# hitung skor relevansi # hitung "skor relevansi"
text_ratio = temp_df.applymap(lambda x: isinstance(x, str)).sum().sum() / (temp_df.size or 1) text_ratio = df.applymap(lambda x: isinstance(x, str)).sum().sum() / (df.size or 1)
row_score = len(temp_df) row_score = len(df)
score = (row_score * 0.7) + (text_ratio * 100) score = (row_score * 0.7) + (text_ratio * 100)
if score > best_score: if score > best_score:
best_score = score best_score = score
best_sheet = sheet_name best_sheet = sheet_name
best_df = temp_df best_df = df
except Exception as e: except Exception as e:
print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}") print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
@ -268,7 +176,7 @@ def read_csv(path: str, sheet: str = None):
df[col] = pd.to_numeric(df[col], errors='ignore') df[col] = pd.to_numeric(df[col], errors='ignore')
else: else:
raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)") raise ValueError("Format file tidak dikenali (hanya .csv, .txt, .xlsx, .xls)")
except Exception as e: except Exception as e:
print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.") print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
@ -280,4 +188,3 @@ def read_csv(path: str, sheet: str = None):
df = df.dropna(how='all') df = df.dropna(how='all')
return df return df