Compare commits

...

3 Commits

Author SHA1 Message Date
dmsanhrProject
52770c1bce update main.py 2025-11-04 22:19:53 +07:00
dmsanhrProject
897cd5d7c3 update normalize name 2025-11-04 22:19:25 +07:00
dmsanhrProject
f25b4f3851 update xlsx sheet selector 2025-11-04 22:17:29 +07:00
3 changed files with 138 additions and 41 deletions

13
main.py
View File

@ -17,6 +17,7 @@ from services.geometry_detector import attach_polygon_geometry_auto
from database.connection import engine from database.connection import engine
from database.models import Base from database.models import Base
import time import time
from datetime import datetime, timedelta
import pathlib import pathlib
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
@ -28,7 +29,7 @@ from sqlalchemy import text
UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True) UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
apiVersion = "2.1.0" apiVersion = "2.1.3"
app = FastAPI( app = FastAPI(
title="ETL Geo Upload Service", title="ETL Geo Upload Service",
version=apiVersion, version=apiVersion,
@ -203,17 +204,19 @@ def process_data(df: pd.DataFrame, ext: str):
from datetime import datetime from datetime import datetime
@app.get("/status", tags=["System"]) @app.get("/status", tags=["System"])
async def server_status(): async def server_status():
utc_time = datetime.utcnow()
wib_time = utc_time + timedelta(hours=7)
formatted_time = wib_time.strftime("%d-%m-%Y %H:%M:%S")
response = { response = {
"status": "success", "status": "success",
"message": "Server is running smoothly ✅", "message": "Server is running smoothly ✅",
"data": { "data": {
"service": "upload_automation", "service": "upload_automation",
"status_code": 200, "status_code": 200,
"timestamp": datetime.utcnow().isoformat() + "Z", "timestamp": f"{formatted_time} WIB"
}, },
"meta": { "meta": {
"version": apiVersion, "version": apiVersion,
@ -225,7 +228,7 @@ async def server_status():
@app.post("/upload") @app.post("/upload")
async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form("")): async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form(""), sheet: Optional[str] = Form("")):
fname = file.filename fname = file.filename
ext = os.path.splitext(fname)[1].lower() ext = os.path.splitext(fname)[1].lower()
contents = await file.read() contents = await file.read()
@ -242,7 +245,7 @@ async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form("
print('ext', ext) print('ext', ext)
if ext == ".csv": if ext == ".csv":
df = read_csv(str(tmp_path)) df = read_csv(str(tmp_path), sheet)
elif ext == ".xlsx": elif ext == ".xlsx":
df = read_csv(str(tmp_path)) df = read_csv(str(tmp_path))
elif ext == ".pdf": elif ext == ".pdf":

View File

@ -69,6 +69,8 @@ def normalize_name(name: str, level: str = None):
if not name: if not name:
return None return None
name = re.sub(r'\s*\([^)]*\)\s*', '', name)
raw = name.lower() raw = name.lower()
raw = re.sub(r'^(desa|kelurahan|kel|dusun|kampung)\s+', '', raw) raw = re.sub(r'^(desa|kelurahan|kel|dusun|kampung)\s+', '', raw)
raw = re.sub(r'^(kecamatan|kec)\s+', '', raw) raw = re.sub(r'^(kecamatan|kec)\s+', '', raw)
@ -117,7 +119,6 @@ def normalize_name(name: str, level: str = None):
def is_geom_empty(g): def is_geom_empty(g):
"""True jika geometry None, NaN, atau geometry Shapely kosong.""" """True jika geometry None, NaN, atau geometry Shapely kosong."""
if g is None: if g is None:
@ -134,7 +135,7 @@ def is_geom_empty(g):
import math import math
def normalize_dynamic(val, is_lat=False): def normalize_lon(val, is_lat=False):
if pd.isna(val): if pd.isna(val):
return None return None
try: try:
@ -194,7 +195,7 @@ def detect_and_build_geometry(df: pd.DataFrame, master_polygons: gpd.GeoDataFram
df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce') df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce')
df[lon_col] = pd.to_numeric(df[lon_col], errors='coerce') df[lon_col] = pd.to_numeric(df[lon_col], errors='coerce')
df[lon_col] = df[lon_col].apply(lambda x: normalize_dynamic(x, is_lat=False)) df[lon_col] = df[lon_col].apply(lambda x: normalize_lon(x, is_lat=False))
df[lat_col] = df[lat_col].apply(normalize_lat) df[lat_col] = df[lat_col].apply(normalize_lat)
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df[lon_col], df[lat_col]), crs="EPSG:4326") gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df[lon_col], df[lat_col]), crs="EPSG:4326")

View File

@ -117,57 +117,149 @@ def detect_delimiter(path, sample_size=2048):
return delim return delim
return ',' return ','
def read_csv(path: str): # def read_csv(path: str):
# ext = os.path.splitext(path)[1].lower()
# try:
# if ext in ['.csv']:
# # === Baca file CSV ===
# header_line = detect_header_line(path)
# delimiter = detect_delimiter(path)
# print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
# df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',')
# elif ext in ['.xlsx', '.xls']:
# # === Baca file Excel ===
# print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
# xls = pd.ExcelFile(path)
# print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")
# # Evaluasi tiap sheet untuk mencari yang paling relevan
# best_sheet = None
# best_score = -1
# best_df = None
# for sheet_name in xls.sheet_names:
# try:
# df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
# df = df.dropna(how='all').dropna(axis=1, how='all')
# if len(df) == 0 or len(df.columns) < 2:
# continue
# # hitung "skor relevansi"
# text_ratio = df.applymap(lambda x: isinstance(x, str)).sum().sum() / (df.size or 1)
# row_score = len(df)
# score = (row_score * 0.7) + (text_ratio * 100)
# if score > best_score:
# best_score = score
# best_sheet = sheet_name
# best_df = df
# except Exception as e:
# print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
# continue
# if best_df is not None:
# print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
# df = best_df
# else:
# raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
# # Konversi tipe numerik jika ada
# for col in df.columns:
# if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
# df[col] = df[col].astype(str).str.replace(',', '', regex=False)
# df[col] = pd.to_numeric(df[col], errors='ignore')
# else:
# raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)")
# except Exception as e:
# print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
# df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
# # Bersihkan kolom dan baris kosong
# df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
# df.columns = [str(c).strip() for c in df.columns]
# df = df.dropna(how='all')
# return df
def read_csv(path: str, sheet: str = None):
ext = os.path.splitext(path)[1].lower() ext = os.path.splitext(path)[1].lower()
try: try:
if ext in ['.csv', '.txt']: if ext in ['.csv']:
# === Baca file CSV === # === Baca file CSV ===
header_line = detect_header_line(path) header_line = detect_header_line(path)
delimiter = detect_delimiter(path) delimiter = detect_delimiter(path)
print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'") print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',') df = pd.read_csv(
path,
header=header_line,
sep=delimiter,
encoding='utf-8',
low_memory=False,
thousands=','
)
elif ext in ['.xlsx', '.xls']: elif ext in ['.xlsx', '.xls']:
# === Baca file Excel === # === Baca file Excel ===
print(f"[INFO] Membaca file Excel: {os.path.basename(path)}") print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
xls = pd.ExcelFile(path) xls = pd.ExcelFile(path)
print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}") print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")
# Evaluasi tiap sheet untuk mencari yang paling relevan # === Jika user memberikan nama sheet ===
best_sheet = None if sheet:
best_score = -1 if sheet not in xls.sheet_names:
best_df = None raise ValueError(f"Sheet '{sheet}' tidak ditemukan dalam file {os.path.basename(path)}")
print(f"[INFO] Membaca sheet yang ditentukan: '{sheet}'")
df = pd.read_excel(xls, sheet_name=sheet, header=0, dtype=str)
df = df.dropna(how='all').dropna(axis=1, how='all')
for sheet_name in xls.sheet_names: else:
try: # === Auto-detect sheet terbaik ===
df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str) print("[INFO] Tidak ada sheet yang ditentukan, mencari sheet paling relevan...")
df = df.dropna(how='all').dropna(axis=1, how='all') best_sheet = None
best_score = -1
best_df = None
if len(df) == 0 or len(df.columns) < 2: for sheet_name in xls.sheet_names:
try:
temp_df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
temp_df = temp_df.dropna(how='all').dropna(axis=1, how='all')
if len(temp_df) == 0 or len(temp_df.columns) < 2:
continue
# hitung skor relevansi
text_ratio = temp_df.applymap(lambda x: isinstance(x, str)).sum().sum() / (temp_df.size or 1)
row_score = len(temp_df)
score = (row_score * 0.7) + (text_ratio * 100)
if score > best_score:
best_score = score
best_sheet = sheet_name
best_df = temp_df
except Exception as e:
print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
continue continue
# hitung "skor relevansi" if best_df is not None:
text_ratio = df.applymap(lambda x: isinstance(x, str)).sum().sum() / (df.size or 1) print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
row_score = len(df) df = best_df
score = (row_score * 0.7) + (text_ratio * 100) else:
raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
if score > best_score:
best_score = score
best_sheet = sheet_name
best_df = df
except Exception as e:
print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
continue
if best_df is not None:
print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
df = best_df
else:
raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
# Konversi tipe numerik jika ada # Konversi tipe numerik jika ada
for col in df.columns: for col in df.columns:
@ -176,7 +268,7 @@ def read_csv(path: str):
df[col] = pd.to_numeric(df[col], errors='ignore') df[col] = pd.to_numeric(df[col], errors='ignore')
else: else:
raise ValueError("Format file tidak dikenali (hanya .csv, .txt, .xlsx, .xls)") raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)")
except Exception as e: except Exception as e:
print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.") print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
@ -188,3 +280,4 @@ def read_csv(path: str):
df = df.dropna(how='all') df = df.dropna(how='all')
return df return df