Compare commits
3 Commits
c953ae7675
...
52770c1bce
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
52770c1bce | ||
|
|
897cd5d7c3 | ||
|
|
f25b4f3851 |
13
main.py
13
main.py
|
|
@ -17,6 +17,7 @@ from services.geometry_detector import attach_polygon_geometry_auto
|
||||||
from database.connection import engine
|
from database.connection import engine
|
||||||
from database.models import Base
|
from database.models import Base
|
||||||
import time
|
import time
|
||||||
|
from datetime import datetime, timedelta
|
||||||
import pathlib
|
import pathlib
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
|
|
||||||
|
|
@ -28,7 +29,7 @@ from sqlalchemy import text
|
||||||
|
|
||||||
UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
|
UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
apiVersion = "2.1.0"
|
apiVersion = "2.1.3"
|
||||||
app = FastAPI(
|
app = FastAPI(
|
||||||
title="ETL Geo Upload Service",
|
title="ETL Geo Upload Service",
|
||||||
version=apiVersion,
|
version=apiVersion,
|
||||||
|
|
@ -203,17 +204,19 @@ def process_data(df: pd.DataFrame, ext: str):
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
@app.get("/status", tags=["System"])
|
@app.get("/status", tags=["System"])
|
||||||
async def server_status():
|
async def server_status():
|
||||||
|
utc_time = datetime.utcnow()
|
||||||
|
wib_time = utc_time + timedelta(hours=7)
|
||||||
|
formatted_time = wib_time.strftime("%d-%m-%Y %H:%M:%S")
|
||||||
response = {
|
response = {
|
||||||
"status": "success",
|
"status": "success",
|
||||||
"message": "Server is running smoothly ✅",
|
"message": "Server is running smoothly ✅",
|
||||||
"data": {
|
"data": {
|
||||||
"service": "upload_automation",
|
"service": "upload_automation",
|
||||||
"status_code": 200,
|
"status_code": 200,
|
||||||
"timestamp": datetime.utcnow().isoformat() + "Z",
|
"timestamp": f"{formatted_time} WIB"
|
||||||
},
|
},
|
||||||
"meta": {
|
"meta": {
|
||||||
"version": apiVersion,
|
"version": apiVersion,
|
||||||
|
|
@ -225,7 +228,7 @@ async def server_status():
|
||||||
|
|
||||||
|
|
||||||
@app.post("/upload")
|
@app.post("/upload")
|
||||||
async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form("")):
|
async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form(""), sheet: Optional[str] = Form("")):
|
||||||
fname = file.filename
|
fname = file.filename
|
||||||
ext = os.path.splitext(fname)[1].lower()
|
ext = os.path.splitext(fname)[1].lower()
|
||||||
contents = await file.read()
|
contents = await file.read()
|
||||||
|
|
@ -242,7 +245,7 @@ async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form("
|
||||||
print('ext', ext)
|
print('ext', ext)
|
||||||
|
|
||||||
if ext == ".csv":
|
if ext == ".csv":
|
||||||
df = read_csv(str(tmp_path))
|
df = read_csv(str(tmp_path), sheet)
|
||||||
elif ext == ".xlsx":
|
elif ext == ".xlsx":
|
||||||
df = read_csv(str(tmp_path))
|
df = read_csv(str(tmp_path))
|
||||||
elif ext == ".pdf":
|
elif ext == ".pdf":
|
||||||
|
|
|
||||||
|
|
@ -69,6 +69,8 @@ def normalize_name(name: str, level: str = None):
|
||||||
if not name:
|
if not name:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
name = re.sub(r'\s*\([^)]*\)\s*', '', name)
|
||||||
|
|
||||||
raw = name.lower()
|
raw = name.lower()
|
||||||
raw = re.sub(r'^(desa|kelurahan|kel|dusun|kampung)\s+', '', raw)
|
raw = re.sub(r'^(desa|kelurahan|kel|dusun|kampung)\s+', '', raw)
|
||||||
raw = re.sub(r'^(kecamatan|kec)\s+', '', raw)
|
raw = re.sub(r'^(kecamatan|kec)\s+', '', raw)
|
||||||
|
|
@ -117,7 +119,6 @@ def normalize_name(name: str, level: str = None):
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def is_geom_empty(g):
|
def is_geom_empty(g):
|
||||||
"""True jika geometry None, NaN, atau geometry Shapely kosong."""
|
"""True jika geometry None, NaN, atau geometry Shapely kosong."""
|
||||||
if g is None:
|
if g is None:
|
||||||
|
|
@ -134,7 +135,7 @@ def is_geom_empty(g):
|
||||||
|
|
||||||
import math
|
import math
|
||||||
|
|
||||||
def normalize_dynamic(val, is_lat=False):
|
def normalize_lon(val, is_lat=False):
|
||||||
if pd.isna(val):
|
if pd.isna(val):
|
||||||
return None
|
return None
|
||||||
try:
|
try:
|
||||||
|
|
@ -194,7 +195,7 @@ def detect_and_build_geometry(df: pd.DataFrame, master_polygons: gpd.GeoDataFram
|
||||||
df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce')
|
df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce')
|
||||||
df[lon_col] = pd.to_numeric(df[lon_col], errors='coerce')
|
df[lon_col] = pd.to_numeric(df[lon_col], errors='coerce')
|
||||||
|
|
||||||
df[lon_col] = df[lon_col].apply(lambda x: normalize_dynamic(x, is_lat=False))
|
df[lon_col] = df[lon_col].apply(lambda x: normalize_lon(x, is_lat=False))
|
||||||
df[lat_col] = df[lat_col].apply(normalize_lat)
|
df[lat_col] = df[lat_col].apply(normalize_lat)
|
||||||
|
|
||||||
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df[lon_col], df[lat_col]), crs="EPSG:4326")
|
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df[lon_col], df[lat_col]), crs="EPSG:4326")
|
||||||
|
|
|
||||||
|
|
@ -117,57 +117,149 @@ def detect_delimiter(path, sample_size=2048):
|
||||||
return delim
|
return delim
|
||||||
return ','
|
return ','
|
||||||
|
|
||||||
def read_csv(path: str):
|
# def read_csv(path: str):
|
||||||
|
# ext = os.path.splitext(path)[1].lower()
|
||||||
|
|
||||||
|
# try:
|
||||||
|
# if ext in ['.csv']:
|
||||||
|
# # === Baca file CSV ===
|
||||||
|
# header_line = detect_header_line(path)
|
||||||
|
# delimiter = detect_delimiter(path)
|
||||||
|
# print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
|
||||||
|
|
||||||
|
# df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',')
|
||||||
|
|
||||||
|
# elif ext in ['.xlsx', '.xls']:
|
||||||
|
# # === Baca file Excel ===
|
||||||
|
# print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
|
||||||
|
# xls = pd.ExcelFile(path)
|
||||||
|
|
||||||
|
# print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")
|
||||||
|
|
||||||
|
# # Evaluasi tiap sheet untuk mencari yang paling relevan
|
||||||
|
# best_sheet = None
|
||||||
|
# best_score = -1
|
||||||
|
# best_df = None
|
||||||
|
|
||||||
|
# for sheet_name in xls.sheet_names:
|
||||||
|
# try:
|
||||||
|
# df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
|
||||||
|
# df = df.dropna(how='all').dropna(axis=1, how='all')
|
||||||
|
|
||||||
|
# if len(df) == 0 or len(df.columns) < 2:
|
||||||
|
# continue
|
||||||
|
|
||||||
|
# # hitung "skor relevansi"
|
||||||
|
# text_ratio = df.applymap(lambda x: isinstance(x, str)).sum().sum() / (df.size or 1)
|
||||||
|
# row_score = len(df)
|
||||||
|
# score = (row_score * 0.7) + (text_ratio * 100)
|
||||||
|
|
||||||
|
# if score > best_score:
|
||||||
|
# best_score = score
|
||||||
|
# best_sheet = sheet_name
|
||||||
|
# best_df = df
|
||||||
|
|
||||||
|
# except Exception as e:
|
||||||
|
# print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
|
||||||
|
# continue
|
||||||
|
|
||||||
|
# if best_df is not None:
|
||||||
|
# print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
|
||||||
|
# df = best_df
|
||||||
|
# else:
|
||||||
|
# raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
|
||||||
|
|
||||||
|
# # Konversi tipe numerik jika ada
|
||||||
|
# for col in df.columns:
|
||||||
|
# if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
|
||||||
|
# df[col] = df[col].astype(str).str.replace(',', '', regex=False)
|
||||||
|
# df[col] = pd.to_numeric(df[col], errors='ignore')
|
||||||
|
|
||||||
|
# else:
|
||||||
|
# raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)")
|
||||||
|
|
||||||
|
# except Exception as e:
|
||||||
|
# print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
|
||||||
|
# df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
|
||||||
|
|
||||||
|
# # Bersihkan kolom dan baris kosong
|
||||||
|
# df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
|
||||||
|
# df.columns = [str(c).strip() for c in df.columns]
|
||||||
|
# df = df.dropna(how='all')
|
||||||
|
|
||||||
|
# return df
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def read_csv(path: str, sheet: str = None):
|
||||||
ext = os.path.splitext(path)[1].lower()
|
ext = os.path.splitext(path)[1].lower()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if ext in ['.csv', '.txt']:
|
if ext in ['.csv']:
|
||||||
# === Baca file CSV ===
|
# === Baca file CSV ===
|
||||||
header_line = detect_header_line(path)
|
header_line = detect_header_line(path)
|
||||||
delimiter = detect_delimiter(path)
|
delimiter = detect_delimiter(path)
|
||||||
print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
|
print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
|
||||||
|
|
||||||
df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',')
|
df = pd.read_csv(
|
||||||
|
path,
|
||||||
|
header=header_line,
|
||||||
|
sep=delimiter,
|
||||||
|
encoding='utf-8',
|
||||||
|
low_memory=False,
|
||||||
|
thousands=','
|
||||||
|
)
|
||||||
|
|
||||||
elif ext in ['.xlsx', '.xls']:
|
elif ext in ['.xlsx', '.xls']:
|
||||||
# === Baca file Excel ===
|
# === Baca file Excel ===
|
||||||
print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
|
print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
|
||||||
xls = pd.ExcelFile(path)
|
xls = pd.ExcelFile(path)
|
||||||
|
|
||||||
print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")
|
print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")
|
||||||
|
|
||||||
# Evaluasi tiap sheet untuk mencari yang paling relevan
|
# === Jika user memberikan nama sheet ===
|
||||||
best_sheet = None
|
if sheet:
|
||||||
best_score = -1
|
if sheet not in xls.sheet_names:
|
||||||
best_df = None
|
raise ValueError(f"Sheet '{sheet}' tidak ditemukan dalam file {os.path.basename(path)}")
|
||||||
|
print(f"[INFO] Membaca sheet yang ditentukan: '{sheet}'")
|
||||||
|
df = pd.read_excel(xls, sheet_name=sheet, header=0, dtype=str)
|
||||||
|
df = df.dropna(how='all').dropna(axis=1, how='all')
|
||||||
|
|
||||||
for sheet_name in xls.sheet_names:
|
else:
|
||||||
try:
|
# === Auto-detect sheet terbaik ===
|
||||||
df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
|
print("[INFO] Tidak ada sheet yang ditentukan, mencari sheet paling relevan...")
|
||||||
df = df.dropna(how='all').dropna(axis=1, how='all')
|
best_sheet = None
|
||||||
|
best_score = -1
|
||||||
|
best_df = None
|
||||||
|
|
||||||
if len(df) == 0 or len(df.columns) < 2:
|
for sheet_name in xls.sheet_names:
|
||||||
|
try:
|
||||||
|
temp_df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
|
||||||
|
temp_df = temp_df.dropna(how='all').dropna(axis=1, how='all')
|
||||||
|
|
||||||
|
if len(temp_df) == 0 or len(temp_df.columns) < 2:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# hitung skor relevansi
|
||||||
|
text_ratio = temp_df.applymap(lambda x: isinstance(x, str)).sum().sum() / (temp_df.size or 1)
|
||||||
|
row_score = len(temp_df)
|
||||||
|
score = (row_score * 0.7) + (text_ratio * 100)
|
||||||
|
|
||||||
|
if score > best_score:
|
||||||
|
best_score = score
|
||||||
|
best_sheet = sheet_name
|
||||||
|
best_df = temp_df
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# hitung "skor relevansi"
|
if best_df is not None:
|
||||||
text_ratio = df.applymap(lambda x: isinstance(x, str)).sum().sum() / (df.size or 1)
|
print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
|
||||||
row_score = len(df)
|
df = best_df
|
||||||
score = (row_score * 0.7) + (text_ratio * 100)
|
else:
|
||||||
|
raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
|
||||||
if score > best_score:
|
|
||||||
best_score = score
|
|
||||||
best_sheet = sheet_name
|
|
||||||
best_df = df
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
if best_df is not None:
|
|
||||||
print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
|
|
||||||
df = best_df
|
|
||||||
else:
|
|
||||||
raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
|
|
||||||
|
|
||||||
# Konversi tipe numerik jika ada
|
# Konversi tipe numerik jika ada
|
||||||
for col in df.columns:
|
for col in df.columns:
|
||||||
|
|
@ -176,7 +268,7 @@ def read_csv(path: str):
|
||||||
df[col] = pd.to_numeric(df[col], errors='ignore')
|
df[col] = pd.to_numeric(df[col], errors='ignore')
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise ValueError("Format file tidak dikenali (hanya .csv, .txt, .xlsx, .xls)")
|
raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
|
print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
|
||||||
|
|
@ -188,3 +280,4 @@ def read_csv(path: str):
|
||||||
df = df.dropna(how='all')
|
df = df.dropna(how='all')
|
||||||
|
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user