# NOTE(review): `math` is not referenced by the helpers below; the import is
# kept in case other code in this module relies on it.
import math

# Power-of-ten divisors tried in ascending order when a coordinate is out of
# range. The smallest divisor that brings the value into range wins.
_SCALE_FACTORS = (10, 100, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9)


def normalize_dynamic(val, is_lat=False):
    """Scale a coordinate stored without its decimal point back into range.

    Args:
        val: Raw coordinate value (number, numeric string, or missing value).
        is_lat: When True the valid range is [-90, 90] (latitude); otherwise
            it is [-180, 180] (longitude).

    Returns:
        The value as a float if it is already in range, otherwise the value
        divided by the smallest power of ten (10 .. 1e9) that brings it into
        range. Returns None when the value is missing, cannot be converted
        to float, or cannot be brought into range by any of those divisors.
    """
    if pd.isna(val):
        return None
    try:
        v = float(val)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as "no coordinate"; a bare
        # `except:` would also swallow KeyboardInterrupt / SystemExit.
        return None

    limit = 90 if is_lat else 180
    if -limit <= v <= limit:
        # Already a plausible coordinate (this also covers 0).
        return v

    for factor in _SCALE_FACTORS:
        nv = v / factor
        if -limit <= nv <= limit:
            return nv

    return None


def normalize_lat(val):
    """Scale an integer-encoded latitude using fixed magnitude thresholds.

    Args:
        val: Raw latitude value (number, numeric string, or missing value).

    Returns:
        None when the value is missing or cannot be converted to float;
        ``v / 1e9`` when ``abs(v) > 1e9``; ``v / 1e8`` when ``abs(v) > 1e8``;
        otherwise the value unchanged as a float.

    NOTE(review): the result is not checked against [-90, 90], and
    magnitudes up to 1e8 are returned unscaled -- confirm this is intended.
    """
    if pd.isna(val):
        return None
    try:
        v = float(val)
    except (TypeError, ValueError, OverflowError):
        # Mirror normalize_dynamic: unparsable input yields None, not a crash.
        return None

    av = abs(v)
    if av > 1e9:  # e.g. -8167413802 (10 digits)
        return v / 1e9
    if av > 1e8:  # fallback for shorter variants
        return v / 1e8
    return v
df[lon_col] / 1e7 - df[lat_col] = df[lat_col] / 1e7 + df[lon_col] = df[lon_col].apply(lambda x: normalize_dynamic(x, is_lat=False)) + df[lat_col] = df[lat_col].apply(normalize_lat) gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df[lon_col], df[lat_col]), crs="EPSG:4326") + print("[INFO] Geometry dibangun dari kolom lat/lon.") return gdf coord_col = next( diff --git a/services/reader_csv.py b/services/reader_csv.py index d1cfdd8..9eafa50 100644 --- a/services/reader_csv.py +++ b/services/reader_csv.py @@ -47,19 +47,24 @@ def read_csv(path: str): delimiter = detect_delimiter(path) print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'") - df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False) + df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',') elif ext in ['.xlsx', '.xls']: # === Baca file Excel === print(f"[INFO] Membaca file Excel: {os.path.basename(path)}") - df = pd.read_excel(path, header=0) # default header baris pertama + pre_df = pd.read_excel(path, header=0, dtype=str) # baca semua sebagai string + df = pre_df.copy() + for col in df.columns: + if df[col].str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any(): + df[col] = df[col].str.replace(',', '', regex=False) + df[col] = pd.to_numeric(df[col], errors='ignore') else: raise ValueError("Format file tidak dikenali (hanya .csv, .txt, .xlsx, .xls)") except Exception as e: print(f"[WARN] Gagal membaca file ({e}), fallback ke default") - df = pd.read_csv(path, encoding='utf-8', low_memory=False) + df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',') # Bersihkan kolom dan baris kosong df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]