# services/reader_csv.py
#
# Previous minimal implementation, kept for reference:
#
# import pandas as pd
# def read_csv(path: str):
#     df = pd.read_csv(path)
#     df.columns = [c.strip() for c in df.columns]
#     return df

import pandas as pd
import re

# Candidate cell separators used only by the header heuristic:
# semicolon, comma, pipe and tab.
_CELL_SPLIT_RE = re.compile(r'[;,|\t]')
_ALPHA_RE = re.compile(r'[A-Za-z]')
_DIGIT_RE = re.compile(r'\d')


def detect_header_line(path, max_rows=10):
    """Guess which line of a CSV file holds the column names.

    The first ``max_rows`` lines are scored with a simple heuristic: the
    share of cells containing an ASCII letter minus the share of cells
    containing a digit. The first line with the highest score wins.

    Args:
        path: Path of the file to inspect. It is opened as UTF-8 and
            undecodable bytes are ignored.
        max_rows: Maximum number of leading lines to inspect. Files with
            fewer lines are handled; a value <= 0 inspects nothing.

    Returns:
        The 0-based index (counting every physical line, blank ones
        included) of the most header-like line, or 0 when no line scores
        above -1 or the file is empty.
    """
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        # zip() stops at whichever runs out first, so a file shorter than
        # max_rows no longer raises StopIteration (which used to abort
        # detection entirely for short files).
        lines = [line for _, line in zip(range(max_rows), f)]

    header_line_idx = 0
    best_score = -1
    for i, line in enumerate(lines):
        cells = _CELL_SPLIT_RE.split(line.strip())
        n_cells = max(len(cells), 1)

        # Heuristic: many letters and few digits -> probably a header.
        alpha_ratio = sum(bool(_ALPHA_RE.search(c)) for c in cells) / n_cells
        digit_ratio = sum(bool(_DIGIT_RE.search(c)) for c in cells) / n_cells
        score = alpha_ratio - digit_ratio  # higher means more header-like

        # Strict comparison keeps the earliest line on ties.
        if score > best_score:
            best_score = score
            header_line_idx = i

    return header_line_idx


def read_csv(path: str):
    """Read a CSV file into a DataFrame, auto-detecting the header line.

    The header line is located with ``detect_header_line``; everything
    above it is skipped. If detection or the first read fails for any
    reason, the file is re-read with the first line as header.

    After loading, columns whose name starts with ``Unnamed`` are dropped,
    column names are stripped of surrounding whitespace, and rows that
    are entirely empty are removed.

    Args:
        path: Path of the CSV file (read as UTF-8, comma separated).

    Returns:
        The cleaned ``pandas.DataFrame``.

    Raises:
        Exception: Whatever ``pandas.read_csv`` raises when the fallback
            read fails as well (for example on an empty file).
    """
    try:
        header_line = detect_header_line(path)
        print(f"[INFO] Detected header line: {header_line + 1}")
        # The detected index counts raw file lines. pandas' ``header=N``
        # ignores blank lines, so a blank line above the header would shift
        # it onto a data row. ``skiprows`` counts raw lines, so skip the
        # preamble that way and take the next line as header.
        df = pd.read_csv(
            path,
            skiprows=header_line,
            header=0,
            encoding='utf-8',
            low_memory=False,
        )
    except Exception as e:
        # Deliberate best-effort fallback: any failure above degrades to a
        # plain read with the first line as header.
        print(f"[WARN] Gagal deteksi header otomatis: {e}, fallback ke baris pertama")
        df = pd.read_csv(path, encoding='utf-8', low_memory=False)

    # Drop the placeholder columns pandas creates for empty header cells.
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    df.columns = [str(c).strip() for c in df.columns]

    # Remove rows in which every value is missing.
    df = df.dropna(how='all')

    return df