Compare commits
No commits in common. "5dbfa493697775a0cc45bcbaf88a71137fa2b20c" and "5061eb5e651605c4d05ae729f4770f5fc249c324" have entirely different histories.
5dbfa49369
...
5061eb5e65
7
main.py
7
main.py
|
|
@ -11,8 +11,7 @@ from core.config import UPLOAD_FOLDER, MAX_FILE_MB
|
||||||
from services.reader_csv import read_csv
|
from services.reader_csv import read_csv
|
||||||
from services.reader_shp import read_shp
|
from services.reader_shp import read_shp
|
||||||
from services.reader_gdb import read_gdb
|
from services.reader_gdb import read_gdb
|
||||||
# from services.reader_pdf import convert_df, read_pdf
|
from services.reader_pdf import convert_df, read_pdf
|
||||||
from testing.test_pdf_multi import convert_df, read_pdf
|
|
||||||
from services.geometry_detector import detect_and_build_geometry
|
from services.geometry_detector import detect_and_build_geometry
|
||||||
from services.geometry_detector import attach_polygon_geometry_auto
|
from services.geometry_detector import attach_polygon_geometry_auto
|
||||||
from database.connection import engine
|
from database.connection import engine
|
||||||
|
|
@ -148,8 +147,6 @@ async def upload_file(file: UploadFile = File(...)):
|
||||||
|
|
||||||
if ext == ".csv":
|
if ext == ".csv":
|
||||||
df = read_csv(str(tmp_path))
|
df = read_csv(str(tmp_path))
|
||||||
elif ext == ".xlsx":
|
|
||||||
df = read_csv(str(tmp_path))
|
|
||||||
elif ext == ".pdf":
|
elif ext == ".pdf":
|
||||||
tbl = read_pdf(tmp_path)
|
tbl = read_pdf(tmp_path)
|
||||||
if len(tbl) > 1:
|
if len(tbl) > 1:
|
||||||
|
|
@ -421,7 +418,7 @@ class UploadRequest(BaseModel):
|
||||||
@app.post("/upload_to_postgis")
|
@app.post("/upload_to_postgis")
|
||||||
def upload_to_postgis(payload: UploadRequest):
|
def upload_to_postgis(payload: UploadRequest):
|
||||||
try:
|
try:
|
||||||
table_name = payload.title.lower().replace(" ", "_").replace("-","_")
|
table_name = payload.title.lower().replace(" ", "_")
|
||||||
|
|
||||||
df = pd.DataFrame(payload.rows)
|
df = pd.DataFrame(payload.rows)
|
||||||
print(f"[INFO] Diterima {len(df)} baris data dari frontend.")
|
print(f"[INFO] Diterima {len(df)} baris data dari frontend.")
|
||||||
|
|
|
||||||
|
|
@ -20,8 +20,6 @@ starlette
|
||||||
openpyxl
|
openpyxl
|
||||||
requests
|
requests
|
||||||
pathlib
|
pathlib
|
||||||
pyarrow
|
|
||||||
geoalchemy2
|
|
||||||
|
|
||||||
# --- jika menggunakan ai ---
|
# --- jika menggunakan ai ---
|
||||||
groq
|
groq
|
||||||
|
|
|
||||||
|
|
@ -159,15 +159,8 @@ def detect_and_build_geometry(df: pd.DataFrame, master_polygons: gpd.GeoDataFram
|
||||||
if lat_col and lon_col:
|
if lat_col and lon_col:
|
||||||
df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce')
|
df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce')
|
||||||
df[lon_col] = pd.to_numeric(df[lon_col], errors='coerce')
|
df[lon_col] = pd.to_numeric(df[lon_col], errors='coerce')
|
||||||
|
|
||||||
lon_median = df[lon_col].abs().median()
|
|
||||||
lat_median = df[lat_col].abs().median()
|
|
||||||
|
|
||||||
if lon_median > 1000 or lat_median > 1000:
|
|
||||||
df[lon_col] = df[lon_col] / 1e7
|
|
||||||
df[lat_col] = df[lat_col] / 1e7
|
|
||||||
|
|
||||||
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df[lon_col], df[lat_col]), crs="EPSG:4326")
|
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df[lon_col], df[lat_col]), crs="EPSG:4326")
|
||||||
|
print("[INFO] Geometry dibangun dari kolom lat/lon.")
|
||||||
return gdf
|
return gdf
|
||||||
|
|
||||||
coord_col = next(
|
coord_col = next(
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,21 @@
|
||||||
|
# import pandas as pd
|
||||||
|
|
||||||
|
# def read_csv(path: str):
|
||||||
|
# df = pd.read_csv(path)
|
||||||
|
# df.columns = [c.strip() for c in df.columns]
|
||||||
|
|
||||||
|
# return df
|
||||||
|
|
||||||
|
|
||||||
|
# services/reader_csv.py
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import re
|
import re
|
||||||
import csv
|
|
||||||
import os
|
|
||||||
|
|
||||||
def detect_header_line(path, max_rows=10):
|
def detect_header_line(path, max_rows=10):
|
||||||
|
"""
|
||||||
|
Mendeteksi baris header (nama kolom) di CSV.
|
||||||
|
Mengembalikan index baris header (0-based).
|
||||||
|
"""
|
||||||
with open(path, 'r', encoding='utf-8', errors='ignore') as f:
|
with open(path, 'r', encoding='utf-8', errors='ignore') as f:
|
||||||
lines = [next(f) for _ in range(max_rows)]
|
lines = [next(f) for _ in range(max_rows)]
|
||||||
|
|
||||||
|
|
@ -11,10 +23,12 @@ def detect_header_line(path, max_rows=10):
|
||||||
best_score = -1
|
best_score = -1
|
||||||
|
|
||||||
for i, line in enumerate(lines):
|
for i, line in enumerate(lines):
|
||||||
|
# Pisahkan berdasarkan koma / titik koma / tab
|
||||||
cells = re.split(r'[;,|\t]', line.strip())
|
cells = re.split(r'[;,|\t]', line.strip())
|
||||||
|
# Heuristik: jika banyak huruf & sedikit angka → kemungkinan header
|
||||||
alpha_ratio = sum(bool(re.search(r'[A-Za-z]', c)) for c in cells) / max(len(cells), 1)
|
alpha_ratio = sum(bool(re.search(r'[A-Za-z]', c)) for c in cells) / max(len(cells), 1)
|
||||||
digit_ratio = sum(bool(re.search(r'\d', c)) for c in cells) / max(len(cells), 1)
|
digit_ratio = sum(bool(re.search(r'\d', c)) for c in cells) / max(len(cells), 1)
|
||||||
score = alpha_ratio - digit_ratio
|
score = alpha_ratio - digit_ratio # makin tinggi makin mirip header
|
||||||
|
|
||||||
if score > best_score:
|
if score > best_score:
|
||||||
best_score = score
|
best_score = score
|
||||||
|
|
@ -23,47 +37,23 @@ def detect_header_line(path, max_rows=10):
|
||||||
return header_line_idx
|
return header_line_idx
|
||||||
|
|
||||||
|
|
||||||
def detect_delimiter(path, sample_size=2048):
|
|
||||||
with open(path, 'r', encoding='utf-8', errors='ignore') as f:
|
|
||||||
sample = f.read(sample_size)
|
|
||||||
sniffer = csv.Sniffer()
|
|
||||||
try:
|
|
||||||
dialect = sniffer.sniff(sample)
|
|
||||||
return dialect.delimiter
|
|
||||||
except Exception:
|
|
||||||
for delim in [',', ';', '\t', '|']:
|
|
||||||
if delim in sample:
|
|
||||||
return delim
|
|
||||||
return ','
|
|
||||||
|
|
||||||
|
|
||||||
def read_csv(path: str):
|
def read_csv(path: str):
|
||||||
ext = os.path.splitext(path)[1].lower() # ambil ekstensi file
|
"""
|
||||||
|
Membaca CSV dengan deteksi otomatis baris header.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
if ext in ['.csv', '.txt']:
|
|
||||||
# === Baca file CSV ===
|
|
||||||
header_line = detect_header_line(path)
|
header_line = detect_header_line(path)
|
||||||
delimiter = detect_delimiter(path)
|
print(f"[INFO] Detected header line: {header_line + 1}")
|
||||||
print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
|
df = pd.read_csv(path, header=header_line, encoding='utf-8', low_memory=False)
|
||||||
|
|
||||||
df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False)
|
|
||||||
|
|
||||||
elif ext in ['.xlsx', '.xls']:
|
|
||||||
# === Baca file Excel ===
|
|
||||||
print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
|
|
||||||
df = pd.read_excel(path, header=0) # default header baris pertama
|
|
||||||
|
|
||||||
else:
|
|
||||||
raise ValueError("Format file tidak dikenali (hanya .csv, .txt, .xlsx, .xls)")
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[WARN] Gagal membaca file ({e}), fallback ke default")
|
print(f"[WARN] Gagal deteksi header otomatis: {e}, fallback ke baris pertama")
|
||||||
df = pd.read_csv(path, encoding='utf-8', low_memory=False)
|
df = pd.read_csv(path, encoding='utf-8', low_memory=False)
|
||||||
|
|
||||||
# Bersihkan kolom dan baris kosong
|
# Bersihkan kolom kosong / unnamed
|
||||||
df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
|
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
|
||||||
df.columns = [str(c).strip() for c in df.columns]
|
df.columns = [str(c).strip() for c in df.columns]
|
||||||
|
|
||||||
|
# Hapus baris kosong total
|
||||||
df = df.dropna(how='all')
|
df = df.dropna(how='all')
|
||||||
|
|
||||||
return df
|
return df
|
||||||
Loading…
Reference in New Issue
Block a user