Update CSV reader: add delimiter detection and Excel (.xlsx/.xls) support

This commit is contained in:
dmsanhrProject 2025-10-30 15:38:20 +07:00
parent 7941affbe6
commit 5dbfa49369
4 changed files with 54 additions and 33 deletions

View File

@ -11,7 +11,8 @@ from core.config import UPLOAD_FOLDER, MAX_FILE_MB
from services.reader_csv import read_csv from services.reader_csv import read_csv
from services.reader_shp import read_shp from services.reader_shp import read_shp
from services.reader_gdb import read_gdb from services.reader_gdb import read_gdb
from services.reader_pdf import convert_df, read_pdf # from services.reader_pdf import convert_df, read_pdf
from testing.test_pdf_multi import convert_df, read_pdf
from services.geometry_detector import detect_and_build_geometry from services.geometry_detector import detect_and_build_geometry
from services.geometry_detector import attach_polygon_geometry_auto from services.geometry_detector import attach_polygon_geometry_auto
from database.connection import engine from database.connection import engine
@ -147,6 +148,8 @@ async def upload_file(file: UploadFile = File(...)):
if ext == ".csv": if ext == ".csv":
df = read_csv(str(tmp_path)) df = read_csv(str(tmp_path))
elif ext == ".xlsx":
df = read_csv(str(tmp_path))
elif ext == ".pdf": elif ext == ".pdf":
tbl = read_pdf(tmp_path) tbl = read_pdf(tmp_path)
if len(tbl) > 1: if len(tbl) > 1:
@ -418,7 +421,7 @@ class UploadRequest(BaseModel):
@app.post("/upload_to_postgis") @app.post("/upload_to_postgis")
def upload_to_postgis(payload: UploadRequest): def upload_to_postgis(payload: UploadRequest):
try: try:
table_name = payload.title.lower().replace(" ", "_") table_name = payload.title.lower().replace(" ", "_").replace("-","_")
df = pd.DataFrame(payload.rows) df = pd.DataFrame(payload.rows)
print(f"[INFO] Diterima {len(df)} baris data dari frontend.") print(f"[INFO] Diterima {len(df)} baris data dari frontend.")

View File

@ -21,6 +21,7 @@ openpyxl
requests requests
pathlib pathlib
pyarrow pyarrow
geoalchemy2
# --- jika menggunakan ai --- # --- jika menggunakan ai ---
groq groq

View File

@ -159,8 +159,15 @@ def detect_and_build_geometry(df: pd.DataFrame, master_polygons: gpd.GeoDataFram
if lat_col and lon_col: if lat_col and lon_col:
df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce') df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce')
df[lon_col] = pd.to_numeric(df[lon_col], errors='coerce') df[lon_col] = pd.to_numeric(df[lon_col], errors='coerce')
lon_median = df[lon_col].abs().median()
lat_median = df[lat_col].abs().median()
if lon_median > 1000 or lat_median > 1000:
df[lon_col] = df[lon_col] / 1e7
df[lat_col] = df[lat_col] / 1e7
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df[lon_col], df[lat_col]), crs="EPSG:4326") gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df[lon_col], df[lat_col]), crs="EPSG:4326")
print("[INFO] Geometry dibangun dari kolom lat/lon.")
return gdf return gdf
coord_col = next( coord_col = next(

View File

@ -1,21 +1,9 @@
# import pandas as pd
# def read_csv(path: str):
# df = pd.read_csv(path)
# df.columns = [c.strip() for c in df.columns]
# return df
# services/reader_csv.py
import pandas as pd import pandas as pd
import re import re
import csv
import os
def detect_header_line(path, max_rows=10): def detect_header_line(path, max_rows=10):
"""
Mendeteksi baris header (nama kolom) di CSV.
Mengembalikan index baris header (0-based).
"""
with open(path, 'r', encoding='utf-8', errors='ignore') as f: with open(path, 'r', encoding='utf-8', errors='ignore') as f:
lines = [next(f) for _ in range(max_rows)] lines = [next(f) for _ in range(max_rows)]
@ -23,12 +11,10 @@ def detect_header_line(path, max_rows=10):
best_score = -1 best_score = -1
for i, line in enumerate(lines): for i, line in enumerate(lines):
# Pisahkan berdasarkan koma / titik koma / tab
cells = re.split(r'[;,|\t]', line.strip()) cells = re.split(r'[;,|\t]', line.strip())
# Heuristik: jika banyak huruf & sedikit angka → kemungkinan header
alpha_ratio = sum(bool(re.search(r'[A-Za-z]', c)) for c in cells) / max(len(cells), 1) alpha_ratio = sum(bool(re.search(r'[A-Za-z]', c)) for c in cells) / max(len(cells), 1)
digit_ratio = sum(bool(re.search(r'\d', c)) for c in cells) / max(len(cells), 1) digit_ratio = sum(bool(re.search(r'\d', c)) for c in cells) / max(len(cells), 1)
score = alpha_ratio - digit_ratio # makin tinggi makin mirip header score = alpha_ratio - digit_ratio
if score > best_score: if score > best_score:
best_score = score best_score = score
@ -37,23 +23,47 @@ def detect_header_line(path, max_rows=10):
return header_line_idx return header_line_idx
def read_csv(path: str): def detect_delimiter(path, sample_size=2048):
""" with open(path, 'r', encoding='utf-8', errors='ignore') as f:
Membaca CSV dengan deteksi otomatis baris header. sample = f.read(sample_size)
""" sniffer = csv.Sniffer()
try: try:
header_line = detect_header_line(path) dialect = sniffer.sniff(sample)
print(f"[INFO] Detected header line: {header_line + 1}") return dialect.delimiter
df = pd.read_csv(path, header=header_line, encoding='utf-8', low_memory=False) except Exception:
for delim in [',', ';', '\t', '|']:
if delim in sample:
return delim
return ','
def read_csv(path: str):
ext = os.path.splitext(path)[1].lower() # ambil ekstensi file
try:
if ext in ['.csv', '.txt']:
# === Baca file CSV ===
header_line = detect_header_line(path)
delimiter = detect_delimiter(path)
print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False)
elif ext in ['.xlsx', '.xls']:
# === Baca file Excel ===
print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
df = pd.read_excel(path, header=0) # default header baris pertama
else:
raise ValueError("Format file tidak dikenali (hanya .csv, .txt, .xlsx, .xls)")
except Exception as e: except Exception as e:
print(f"[WARN] Gagal deteksi header otomatis: {e}, fallback ke baris pertama") print(f"[WARN] Gagal membaca file ({e}), fallback ke default")
df = pd.read_csv(path, encoding='utf-8', low_memory=False) df = pd.read_csv(path, encoding='utf-8', low_memory=False)
# Bersihkan kolom kosong / unnamed # Bersihkan kolom dan baris kosong
df = df.loc[:, ~df.columns.str.contains('^Unnamed')] df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
df.columns = [str(c).strip() for c in df.columns] df.columns = [str(c).strip() for c in df.columns]
# Hapus baris kosong total
df = df.dropna(how='all') df = df.dropna(how='all')
return df return df