Compare commits

..

No commits in common. "52770c1bce170d01fe754779836d1e3af2a65cc3" and "c953ae7675ca6097a2f62b4a1dd97558695d3251" have entirely different histories.

3 changed files with 41 additions and 138 deletions

13
main.py
View File

@@ -17,7 +17,6 @@ from services.geometry_detector import attach_polygon_geometry_auto
from database.connection import engine
from database.models import Base
import time
from datetime import datetime, timedelta
import pathlib
from fastapi.middleware.cors import CORSMiddleware
@@ -29,7 +28,7 @@ from sqlalchemy import text
UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
apiVersion = "2.1.3"
apiVersion = "2.1.0"
app = FastAPI(
title="ETL Geo Upload Service",
version=apiVersion,
@@ -204,19 +203,17 @@ def process_data(df: pd.DataFrame, ext: str):
from datetime import datetime
@app.get("/status", tags=["System"])
async def server_status():
utc_time = datetime.utcnow()
wib_time = utc_time + timedelta(hours=7)
formatted_time = wib_time.strftime("%d-%m-%Y %H:%M:%S")
response = {
"status": "success",
"message": "Server is running smoothly ✅",
"data": {
"service": "upload_automation",
"status_code": 200,
"timestamp": f"{formatted_time} WIB"
"timestamp": datetime.utcnow().isoformat() + "Z",
},
"meta": {
"version": apiVersion,
@@ -228,7 +225,7 @@ async def server_status():
@app.post("/upload")
async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form(""), sheet: Optional[str] = Form("")):
async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form("")):
fname = file.filename
ext = os.path.splitext(fname)[1].lower()
contents = await file.read()
@@ -245,7 +242,7 @@ async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form("
print('ext', ext)
if ext == ".csv":
df = read_csv(str(tmp_path), sheet)
df = read_csv(str(tmp_path))
elif ext == ".xlsx":
df = read_csv(str(tmp_path))
elif ext == ".pdf":

View File

@@ -68,8 +68,6 @@ def normalize_name(name: str, level: str = None):
name = name.strip()
if not name:
return None
name = re.sub(r'\s*\([^)]*\)\s*', '', name)
raw = name.lower()
raw = re.sub(r'^(desa|kelurahan|kel|dusun|kampung)\s+', '', raw)
@@ -119,6 +117,7 @@ def normalize_name(name: str, level: str = None):
def is_geom_empty(g):
"""True jika geometry None, NaN, atau geometry Shapely kosong."""
if g is None:
@@ -135,7 +134,7 @@ def is_geom_empty(g):
import math
def normalize_lon(val, is_lat=False):
def normalize_dynamic(val, is_lat=False):
if pd.isna(val):
return None
try:
@@ -195,7 +194,7 @@ def detect_and_build_geometry(df: pd.DataFrame, master_polygons: gpd.GeoDataFram
df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce')
df[lon_col] = pd.to_numeric(df[lon_col], errors='coerce')
df[lon_col] = df[lon_col].apply(lambda x: normalize_lon(x, is_lat=False))
df[lon_col] = df[lon_col].apply(lambda x: normalize_dynamic(x, is_lat=False))
df[lat_col] = df[lat_col].apply(normalize_lat)
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df[lon_col], df[lat_col]), crs="EPSG:4326")

View File

@@ -117,149 +117,57 @@ def detect_delimiter(path, sample_size=2048):
return delim
return ','
# def read_csv(path: str):
# ext = os.path.splitext(path)[1].lower()
# try:
# if ext in ['.csv']:
# # === Baca file CSV ===
# header_line = detect_header_line(path)
# delimiter = detect_delimiter(path)
# print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
# df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',')
# elif ext in ['.xlsx', '.xls']:
# # === Baca file Excel ===
# print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
# xls = pd.ExcelFile(path)
# print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")
# # Evaluasi tiap sheet untuk mencari yang paling relevan
# best_sheet = None
# best_score = -1
# best_df = None
# for sheet_name in xls.sheet_names:
# try:
# df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
# df = df.dropna(how='all').dropna(axis=1, how='all')
# if len(df) == 0 or len(df.columns) < 2:
# continue
# # hitung "skor relevansi"
# text_ratio = df.applymap(lambda x: isinstance(x, str)).sum().sum() / (df.size or 1)
# row_score = len(df)
# score = (row_score * 0.7) + (text_ratio * 100)
# if score > best_score:
# best_score = score
# best_sheet = sheet_name
# best_df = df
# except Exception as e:
# print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
# continue
# if best_df is not None:
# print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
# df = best_df
# else:
# raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
# # Konversi tipe numerik jika ada
# for col in df.columns:
# if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
# df[col] = df[col].astype(str).str.replace(',', '', regex=False)
# df[col] = pd.to_numeric(df[col], errors='ignore')
# else:
# raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)")
# except Exception as e:
# print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
# df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
# # Bersihkan kolom dan baris kosong
# df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
# df.columns = [str(c).strip() for c in df.columns]
# df = df.dropna(how='all')
# return df
def read_csv(path: str, sheet: str = None):
def read_csv(path: str):
ext = os.path.splitext(path)[1].lower()
try:
if ext in ['.csv']:
if ext in ['.csv', '.txt']:
# === Baca file CSV ===
header_line = detect_header_line(path)
delimiter = detect_delimiter(path)
print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
df = pd.read_csv(
path,
header=header_line,
sep=delimiter,
encoding='utf-8',
low_memory=False,
thousands=','
)
df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',')
elif ext in ['.xlsx', '.xls']:
# === Baca file Excel ===
print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
xls = pd.ExcelFile(path)
print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")
# === Jika user memberikan nama sheet ===
if sheet:
if sheet not in xls.sheet_names:
raise ValueError(f"Sheet '{sheet}' tidak ditemukan dalam file {os.path.basename(path)}")
print(f"[INFO] Membaca sheet yang ditentukan: '{sheet}'")
df = pd.read_excel(xls, sheet_name=sheet, header=0, dtype=str)
df = df.dropna(how='all').dropna(axis=1, how='all')
# Evaluasi tiap sheet untuk mencari yang paling relevan
best_sheet = None
best_score = -1
best_df = None
else:
# === Auto-detect sheet terbaik ===
print("[INFO] Tidak ada sheet yang ditentukan, mencari sheet paling relevan...")
best_sheet = None
best_score = -1
best_df = None
for sheet_name in xls.sheet_names:
try:
df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
df = df.dropna(how='all').dropna(axis=1, how='all')
for sheet_name in xls.sheet_names:
try:
temp_df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
temp_df = temp_df.dropna(how='all').dropna(axis=1, how='all')
if len(temp_df) == 0 or len(temp_df.columns) < 2:
continue
# hitung skor relevansi
text_ratio = temp_df.applymap(lambda x: isinstance(x, str)).sum().sum() / (temp_df.size or 1)
row_score = len(temp_df)
score = (row_score * 0.7) + (text_ratio * 100)
if score > best_score:
best_score = score
best_sheet = sheet_name
best_df = temp_df
except Exception as e:
print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
if len(df) == 0 or len(df.columns) < 2:
continue
if best_df is not None:
print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
df = best_df
else:
raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
# hitung "skor relevansi"
text_ratio = df.applymap(lambda x: isinstance(x, str)).sum().sum() / (df.size or 1)
row_score = len(df)
score = (row_score * 0.7) + (text_ratio * 100)
if score > best_score:
best_score = score
best_sheet = sheet_name
best_df = df
except Exception as e:
print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
continue
if best_df is not None:
print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
df = best_df
else:
raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
# Konversi tipe numerik jika ada
for col in df.columns:
@@ -268,7 +176,7 @@ def read_csv(path: str, sheet: str = None):
df[col] = pd.to_numeric(df[col], errors='ignore')
else:
raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)")
raise ValueError("Format file tidak dikenali (hanya .csv, .txt, .xlsx, .xls)")
except Exception as e:
print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
@@ -280,4 +188,3 @@ def read_csv(path: str, sheet: str = None):
df = df.dropna(how='all')
return df