Compare commits

..

No commits in common. "52770c1bce170d01fe754779836d1e3af2a65cc3" and "c953ae7675ca6097a2f62b4a1dd97558695d3251" have entirely different histories.

3 changed files with 41 additions and 138 deletions

13
main.py
View File

@ -17,7 +17,6 @@ from services.geometry_detector import attach_polygon_geometry_auto
from database.connection import engine from database.connection import engine
from database.models import Base from database.models import Base
import time import time
from datetime import datetime, timedelta
import pathlib import pathlib
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
@ -29,7 +28,7 @@ from sqlalchemy import text
UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True) UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
apiVersion = "2.1.3" apiVersion = "2.1.0"
app = FastAPI( app = FastAPI(
title="ETL Geo Upload Service", title="ETL Geo Upload Service",
version=apiVersion, version=apiVersion,
@ -204,19 +203,17 @@ def process_data(df: pd.DataFrame, ext: str):
from datetime import datetime from datetime import datetime
@app.get("/status", tags=["System"]) @app.get("/status", tags=["System"])
async def server_status(): async def server_status():
utc_time = datetime.utcnow()
wib_time = utc_time + timedelta(hours=7)
formatted_time = wib_time.strftime("%d-%m-%Y %H:%M:%S")
response = { response = {
"status": "success", "status": "success",
"message": "Server is running smoothly ✅", "message": "Server is running smoothly ✅",
"data": { "data": {
"service": "upload_automation", "service": "upload_automation",
"status_code": 200, "status_code": 200,
"timestamp": f"{formatted_time} WIB" "timestamp": datetime.utcnow().isoformat() + "Z",
}, },
"meta": { "meta": {
"version": apiVersion, "version": apiVersion,
@ -228,7 +225,7 @@ async def server_status():
@app.post("/upload") @app.post("/upload")
async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form(""), sheet: Optional[str] = Form("")): async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form("")):
fname = file.filename fname = file.filename
ext = os.path.splitext(fname)[1].lower() ext = os.path.splitext(fname)[1].lower()
contents = await file.read() contents = await file.read()
@ -245,7 +242,7 @@ async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form("
print('ext', ext) print('ext', ext)
if ext == ".csv": if ext == ".csv":
df = read_csv(str(tmp_path), sheet) df = read_csv(str(tmp_path))
elif ext == ".xlsx": elif ext == ".xlsx":
df = read_csv(str(tmp_path)) df = read_csv(str(tmp_path))
elif ext == ".pdf": elif ext == ".pdf":

View File

@ -69,8 +69,6 @@ def normalize_name(name: str, level: str = None):
if not name: if not name:
return None return None
name = re.sub(r'\s*\([^)]*\)\s*', '', name)
raw = name.lower() raw = name.lower()
raw = re.sub(r'^(desa|kelurahan|kel|dusun|kampung)\s+', '', raw) raw = re.sub(r'^(desa|kelurahan|kel|dusun|kampung)\s+', '', raw)
raw = re.sub(r'^(kecamatan|kec)\s+', '', raw) raw = re.sub(r'^(kecamatan|kec)\s+', '', raw)
@ -119,6 +117,7 @@ def normalize_name(name: str, level: str = None):
def is_geom_empty(g): def is_geom_empty(g):
"""True jika geometry None, NaN, atau geometry Shapely kosong.""" """True jika geometry None, NaN, atau geometry Shapely kosong."""
if g is None: if g is None:
@ -135,7 +134,7 @@ def is_geom_empty(g):
import math import math
def normalize_lon(val, is_lat=False): def normalize_dynamic(val, is_lat=False):
if pd.isna(val): if pd.isna(val):
return None return None
try: try:
@ -195,7 +194,7 @@ def detect_and_build_geometry(df: pd.DataFrame, master_polygons: gpd.GeoDataFram
df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce') df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce')
df[lon_col] = pd.to_numeric(df[lon_col], errors='coerce') df[lon_col] = pd.to_numeric(df[lon_col], errors='coerce')
df[lon_col] = df[lon_col].apply(lambda x: normalize_lon(x, is_lat=False)) df[lon_col] = df[lon_col].apply(lambda x: normalize_dynamic(x, is_lat=False))
df[lat_col] = df[lat_col].apply(normalize_lat) df[lat_col] = df[lat_col].apply(normalize_lat)
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df[lon_col], df[lat_col]), crs="EPSG:4326") gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df[lon_col], df[lat_col]), crs="EPSG:4326")

View File

@ -117,149 +117,57 @@ def detect_delimiter(path, sample_size=2048):
return delim return delim
return ',' return ','
# def read_csv(path: str): def read_csv(path: str):
# ext = os.path.splitext(path)[1].lower()
# try:
# if ext in ['.csv']:
# # === Baca file CSV ===
# header_line = detect_header_line(path)
# delimiter = detect_delimiter(path)
# print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
# df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',')
# elif ext in ['.xlsx', '.xls']:
# # === Baca file Excel ===
# print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
# xls = pd.ExcelFile(path)
# print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")
# # Evaluasi tiap sheet untuk mencari yang paling relevan
# best_sheet = None
# best_score = -1
# best_df = None
# for sheet_name in xls.sheet_names:
# try:
# df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
# df = df.dropna(how='all').dropna(axis=1, how='all')
# if len(df) == 0 or len(df.columns) < 2:
# continue
# # hitung "skor relevansi"
# text_ratio = df.applymap(lambda x: isinstance(x, str)).sum().sum() / (df.size or 1)
# row_score = len(df)
# score = (row_score * 0.7) + (text_ratio * 100)
# if score > best_score:
# best_score = score
# best_sheet = sheet_name
# best_df = df
# except Exception as e:
# print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
# continue
# if best_df is not None:
# print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
# df = best_df
# else:
# raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
# # Konversi tipe numerik jika ada
# for col in df.columns:
# if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
# df[col] = df[col].astype(str).str.replace(',', '', regex=False)
# df[col] = pd.to_numeric(df[col], errors='ignore')
# else:
# raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)")
# except Exception as e:
# print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
# df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
# # Bersihkan kolom dan baris kosong
# df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
# df.columns = [str(c).strip() for c in df.columns]
# df = df.dropna(how='all')
# return df
def read_csv(path: str, sheet: str = None):
ext = os.path.splitext(path)[1].lower() ext = os.path.splitext(path)[1].lower()
try: try:
if ext in ['.csv']: if ext in ['.csv', '.txt']:
# === Baca file CSV === # === Baca file CSV ===
header_line = detect_header_line(path) header_line = detect_header_line(path)
delimiter = detect_delimiter(path) delimiter = detect_delimiter(path)
print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'") print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
df = pd.read_csv( df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',')
path,
header=header_line,
sep=delimiter,
encoding='utf-8',
low_memory=False,
thousands=','
)
elif ext in ['.xlsx', '.xls']: elif ext in ['.xlsx', '.xls']:
# === Baca file Excel === # === Baca file Excel ===
print(f"[INFO] Membaca file Excel: {os.path.basename(path)}") print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
xls = pd.ExcelFile(path) xls = pd.ExcelFile(path)
print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}") print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")
# === Jika user memberikan nama sheet === # Evaluasi tiap sheet untuk mencari yang paling relevan
if sheet: best_sheet = None
if sheet not in xls.sheet_names: best_score = -1
raise ValueError(f"Sheet '{sheet}' tidak ditemukan dalam file {os.path.basename(path)}") best_df = None
print(f"[INFO] Membaca sheet yang ditentukan: '{sheet}'")
df = pd.read_excel(xls, sheet_name=sheet, header=0, dtype=str)
df = df.dropna(how='all').dropna(axis=1, how='all')
else: for sheet_name in xls.sheet_names:
# === Auto-detect sheet terbaik === try:
print("[INFO] Tidak ada sheet yang ditentukan, mencari sheet paling relevan...") df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
best_sheet = None df = df.dropna(how='all').dropna(axis=1, how='all')
best_score = -1
best_df = None
for sheet_name in xls.sheet_names: if len(df) == 0 or len(df.columns) < 2:
try:
temp_df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
temp_df = temp_df.dropna(how='all').dropna(axis=1, how='all')
if len(temp_df) == 0 or len(temp_df.columns) < 2:
continue
# hitung skor relevansi
text_ratio = temp_df.applymap(lambda x: isinstance(x, str)).sum().sum() / (temp_df.size or 1)
row_score = len(temp_df)
score = (row_score * 0.7) + (text_ratio * 100)
if score > best_score:
best_score = score
best_sheet = sheet_name
best_df = temp_df
except Exception as e:
print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
continue continue
if best_df is not None: # hitung "skor relevansi"
print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}") text_ratio = df.applymap(lambda x: isinstance(x, str)).sum().sum() / (df.size or 1)
df = best_df row_score = len(df)
else: score = (row_score * 0.7) + (text_ratio * 100)
raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
if score > best_score:
best_score = score
best_sheet = sheet_name
best_df = df
except Exception as e:
print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
continue
if best_df is not None:
print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
df = best_df
else:
raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
# Konversi tipe numerik jika ada # Konversi tipe numerik jika ada
for col in df.columns: for col in df.columns:
@ -268,7 +176,7 @@ def read_csv(path: str, sheet: str = None):
df[col] = pd.to_numeric(df[col], errors='ignore') df[col] = pd.to_numeric(df[col], errors='ignore')
else: else:
raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)") raise ValueError("Format file tidak dikenali (hanya .csv, .txt, .xlsx, .xls)")
except Exception as e: except Exception as e:
print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.") print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
@ -280,4 +188,3 @@ def read_csv(path: str, sheet: str = None):
df = df.dropna(how='all') df = df.dropna(how='all')
return df return df