fixing lat lon logic

This commit is contained in:
dmsanhrProject 2025-10-30 17:14:53 +07:00
parent 5dbfa49369
commit 25d652d7f0
2 changed files with 51 additions and 15 deletions

View File

@ -132,6 +132,44 @@ def is_geom_empty(g):
import math
def normalize_dynamic(val, is_lat=False):
    """Scale a raw coordinate value into the valid degree range.

    A value already inside the range (-90..90 when ``is_lat`` is true,
    otherwise -180..180) is returned unchanged. A larger magnitude is divided
    by successive powers of ten (10 up to 1e9) and the first quotient that
    falls inside the range is returned.

    Args:
        val: Raw value; anything ``float()`` accepts, or a missing value.
        is_lat: Use the latitude range (+/-90) instead of the longitude
            range (+/-180).

    Returns:
        The value as a float (possibly scaled down), or ``None`` when the
        value is missing, cannot be converted to a float, or cannot be
        brought into range by any of the tried factors.
    """
    if pd.isna(val):
        return None
    try:
        v = float(val)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a coordinate"; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return None

    limit = 90 if is_lat else 180
    if -limit <= v <= limit:
        return v

    # The smallest factor that fits wins, so an ambiguous value resolves to
    # the largest in-range magnitude.
    for factor in (10, 100, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9):
        nv = v / factor
        if -limit <= nv <= limit:
            return nv
    # NaN/inf parsed from strings and magnitudes beyond 1e9 * limit end here.
    return None
def normalize_lat(val):
    """Scale an integer-encoded latitude down to a plain float.

    Args:
        val: Raw value; anything ``float()`` accepts, or a missing value.

    Returns:
        ``None`` when the value is missing or cannot be converted to a float;
        otherwise the float value, divided by 1e9 when its magnitude exceeds
        1e9, by 1e8 when it exceeds 1e8, and unchanged in all other cases.
    """
    if pd.isna(val):
        return None
    try:
        v = float(val)
    except (TypeError, ValueError, OverflowError):
        # Match normalize_dynamic: unparseable input yields None, not a crash.
        return None

    av = abs(v)
    if av > 1e9:  # example: -8167413802 (10 digits)
        return v / 1e9
    if av > 1e8:  # fallback for shorter encodings
        return v / 1e8
    return v
# ============================================================ # ============================================================
# FUNGSI UTAMA GEOMETRY DETECTION (LAT/LON / PATH) # FUNGSI UTAMA GEOMETRY DETECTION (LAT/LON / PATH)
@ -149,25 +187,18 @@ def detect_and_build_geometry(df: pd.DataFrame, master_polygons: gpd.GeoDataFram
print(f"[INFO] Detected existing geometry in GeoDataFrame ({geom_count} features, {geom_type}).") print(f"[INFO] Detected existing geometry in GeoDataFrame ({geom_count} features, {geom_type}).")
return df return df
lat_col = next( lat_col = next((c for c in df.columns if re.search(r'\b(lat|latitude|y[_\s]*coord|y$)\b', c.lower())), None)
(c for c in df.columns if re.search(r'\b(lat|latitude|y[_\s]*coord|y$)\b', c.lower())), None lon_col = next((c for c in df.columns if re.search(r'\b(lon|long|longitude|x[_\s]*coord|x$)\b', c.lower())), None)
)
lon_col = next(
(c for c in df.columns if re.search(r'\b(lon|long|longitude|x[_\s]*coord|x$)\b', c.lower())), None
)
if lat_col and lon_col: if lat_col and lon_col:
df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce') df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce')
df[lon_col] = pd.to_numeric(df[lon_col], errors='coerce') df[lon_col] = pd.to_numeric(df[lon_col], errors='coerce')
lon_median = df[lon_col].abs().median() df[lon_col] = df[lon_col].apply(lambda x: normalize_dynamic(x, is_lat=False))
lat_median = df[lat_col].abs().median() df[lat_col] = df[lat_col].apply(normalize_lat)
if lon_median > 1000 or lat_median > 1000:
df[lon_col] = df[lon_col] / 1e7
df[lat_col] = df[lat_col] / 1e7
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df[lon_col], df[lat_col]), crs="EPSG:4326") gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df[lon_col], df[lat_col]), crs="EPSG:4326")
print("[INFO] Geometry dibangun dari kolom lat/lon.")
return gdf return gdf
coord_col = next( coord_col = next(

View File

@ -47,19 +47,24 @@ def read_csv(path: str):
delimiter = detect_delimiter(path) delimiter = detect_delimiter(path)
print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'") print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False) df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',')
elif ext in ['.xlsx', '.xls']: elif ext in ['.xlsx', '.xls']:
# === Baca file Excel === # === Baca file Excel ===
print(f"[INFO] Membaca file Excel: {os.path.basename(path)}") print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
df = pd.read_excel(path, header=0) # default header baris pertama pre_df = pd.read_excel(path, header=0, dtype=str) # baca semua sebagai string
df = pre_df.copy()
for col in df.columns:
if df[col].str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
df[col] = df[col].str.replace(',', '', regex=False)
df[col] = pd.to_numeric(df[col], errors='ignore')
else: else:
raise ValueError("Format file tidak dikenali (hanya .csv, .txt, .xlsx, .xls)") raise ValueError("Format file tidak dikenali (hanya .csv, .txt, .xlsx, .xls)")
except Exception as e: except Exception as e:
print(f"[WARN] Gagal membaca file ({e}), fallback ke default") print(f"[WARN] Gagal membaca file ({e}), fallback ke default")
df = pd.read_csv(path, encoding='utf-8', low_memory=False) df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
# Bersihkan kolom dan baris kosong # Bersihkan kolom dan baris kosong
df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')] df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]