From 5dbfa493697775a0cc45bcbaf88a71137fa2b20c Mon Sep 17 00:00:00 2001 From: dmsanhrProject Date: Thu, 30 Oct 2025 15:38:20 +0700 Subject: [PATCH] update reader csv --- main.py | 7 +++- requirements.txt | 1 + services/geometry_detector.py | 9 ++++- services/reader_csv.py | 70 ++++++++++++++++++++--------------- 4 files changed, 54 insertions(+), 33 deletions(-) diff --git a/main.py b/main.py index aeafe79..fa5d7be 100644 --- a/main.py +++ b/main.py @@ -11,7 +11,8 @@ from core.config import UPLOAD_FOLDER, MAX_FILE_MB from services.reader_csv import read_csv from services.reader_shp import read_shp from services.reader_gdb import read_gdb -from services.reader_pdf import convert_df, read_pdf +# from services.reader_pdf import convert_df, read_pdf +from testing.test_pdf_multi import convert_df, read_pdf from services.geometry_detector import detect_and_build_geometry from services.geometry_detector import attach_polygon_geometry_auto from database.connection import engine @@ -147,6 +148,8 @@ async def upload_file(file: UploadFile = File(...)): if ext == ".csv": df = read_csv(str(tmp_path)) + elif ext == ".xlsx": + df = read_csv(str(tmp_path)) elif ext == ".pdf": tbl = read_pdf(tmp_path) if len(tbl) > 1: @@ -418,7 +421,7 @@ class UploadRequest(BaseModel): @app.post("/upload_to_postgis") def upload_to_postgis(payload: UploadRequest): try: - table_name = payload.title.lower().replace(" ", "_") + table_name = payload.title.lower().replace(" ", "_").replace("-","_") df = pd.DataFrame(payload.rows) print(f"[INFO] Diterima {len(df)} baris data dari frontend.") diff --git a/requirements.txt b/requirements.txt index 715164f..dfb34bc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,6 +21,7 @@ openpyxl requests pathlib pyarrow +geoalchemy2 # --- jika menggunakan ai --- groq diff --git a/services/geometry_detector.py b/services/geometry_detector.py index 51fb175..b8bf0b7 100644 --- a/services/geometry_detector.py +++ b/services/geometry_detector.py @@ -159,8 +159,15 @@ def detect_and_build_geometry(df: pd.DataFrame, master_polygons: gpd.GeoDataFram if lat_col and lon_col: df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce') df[lon_col] = pd.to_numeric(df[lon_col], errors='coerce') + + lon_median = df[lon_col].abs().median() + lat_median = df[lat_col].abs().median() + + if lon_median > 1000 or lat_median > 1000: + df[lon_col] = df[lon_col] / 1e7 + df[lat_col] = df[lat_col] / 1e7 + gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df[lon_col], df[lat_col]), crs="EPSG:4326") - print("[INFO] Geometry dibangun dari kolom lat/lon.") return gdf coord_col = next( diff --git a/services/reader_csv.py b/services/reader_csv.py index 7cb5516..d1cfdd8 100644 --- a/services/reader_csv.py +++ b/services/reader_csv.py @@ -1,21 +1,9 @@ -# import pandas as pd - -# def read_csv(path: str): -# df = pd.read_csv(path) -# df.columns = [c.strip() for c in df.columns] - -# return df - - -# services/reader_csv.py import pandas as pd import re +import csv +import os def detect_header_line(path, max_rows=10): - """ - Mendeteksi baris header (nama kolom) di CSV. - Mengembalikan index baris header (0-based). - """ with open(path, 'r', encoding='utf-8', errors='ignore') as f: lines = [next(f) for _ in range(max_rows)] @@ -23,12 +11,10 @@ def detect_header_line(path, max_rows=10): best_score = -1 for i, line in enumerate(lines): - # Pisahkan berdasarkan koma / titik koma / tab cells = re.split(r'[;,|\t]', line.strip()) - # Heuristik: jika banyak huruf & sedikit angka → kemungkinan header alpha_ratio = sum(bool(re.search(r'[A-Za-z]', c)) for c in cells) / max(len(cells), 1) digit_ratio = sum(bool(re.search(r'\d', c)) for c in cells) / max(len(cells), 1) - score = alpha_ratio - digit_ratio # makin tinggi makin mirip header + score = alpha_ratio - digit_ratio if score > best_score: best_score = score @@ -37,23 +23,47 @@ def detect_header_line(path, max_rows=10): return header_line_idx -def read_csv(path: str): - """ - Membaca CSV dengan deteksi otomatis baris header. - """ +def detect_delimiter(path, sample_size=2048): + with open(path, 'r', encoding='utf-8', errors='ignore') as f: + sample = f.read(sample_size) + sniffer = csv.Sniffer() try: - header_line = detect_header_line(path) - print(f"[INFO] Detected header line: {header_line + 1}") - df = pd.read_csv(path, header=header_line, encoding='utf-8', low_memory=False) + dialect = sniffer.sniff(sample) + return dialect.delimiter + except Exception: + for delim in [',', ';', '\t', '|']: + if delim in sample: + return delim + return ',' + + +def read_csv(path: str): + ext = os.path.splitext(path)[1].lower() # ambil ekstensi file + + try: + if ext in ['.csv', '.txt']: + # === Baca file CSV === + header_line = detect_header_line(path) + delimiter = detect_delimiter(path) + print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'") + + df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False) + + elif ext in ['.xlsx', '.xls']: + # === Baca file Excel === + print(f"[INFO] Membaca file Excel: {os.path.basename(path)}") + df = pd.read_excel(path, header=0) # default header baris pertama + + else: + raise ValueError("Format file tidak dikenali (hanya .csv, .txt, .xlsx, .xls)") + except Exception as e: - print(f"[WARN] Gagal deteksi header otomatis: {e}, fallback ke baris pertama") + print(f"[WARN] Gagal membaca file ({e}), fallback ke default") df = pd.read_csv(path, encoding='utf-8', low_memory=False) - # Bersihkan kolom kosong / unnamed - df = df.loc[:, ~df.columns.str.contains('^Unnamed')] + # Bersihkan kolom dan baris kosong + df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')] df.columns = [str(c).strip() for c in df.columns] - - # Hapus baris kosong total df = df.dropna(how='all') - return df + return df \ No newline at end of file