60 lines
1.8 KiB
Python
60 lines
1.8 KiB
Python
|
|
# import pandas as pd
|
||
|
|
|
||
|
|
# def read_csv(path: str):
|
||
|
|
# df = pd.read_csv(path)
|
||
|
|
# df.columns = [c.strip() for c in df.columns]
|
||
|
|
|
||
|
|
# return df
|
||
|
|
|
||
|
|
|
||
|
|
# services/reader_csv.py
|
||
|
|
import pandas as pd
|
||
|
|
import re
|
||
|
|
|
||
|
|
def detect_header_line(path, max_rows: int = 10) -> int:
    """Guess which line of a delimited text file holds the column names.

    Only the first ``max_rows`` lines are inspected. Each line is split on
    ``;``, ``,``, ``|`` or tab, and scored as
    ``(share of cells containing a letter) - (share of cells containing a digit)``.
    The line with the highest score wins; on ties the earliest line is kept.

    Args:
        path: Path of the file to inspect. It is opened as UTF-8 text and
            undecodable bytes are ignored.
        max_rows: Maximum number of leading lines to examine.

    Returns:
        The 0-based index of the most header-like line, counted in raw file
        lines (blank lines included). Returns 0 when the file is empty or no
        line scores above -1.
    """
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        # zip() stops at the shorter input, so a file with fewer than
        # max_rows lines is read to EOF instead of raising StopIteration
        # (which the previous `next(f)` comprehension did).
        lines = [line for _, line in zip(range(max_rows), f)]

    # Compile once; the same patterns are applied to every cell of every line.
    delimiter = re.compile(r'[;,|\t]')
    has_letter = re.compile(r'[A-Za-z]')
    has_digit = re.compile(r'\d')

    header_line_idx = 0
    best_score = -1

    for i, line in enumerate(lines):
        # re.split always yields at least one cell, so the division is safe.
        cells = delimiter.split(line.strip())
        cell_count = len(cells)

        # Heuristic: many cells with letters and few with digits -> likely header.
        alpha_ratio = sum(1 for c in cells if has_letter.search(c)) / cell_count
        digit_ratio = sum(1 for c in cells if has_digit.search(c)) / cell_count
        score = alpha_ratio - digit_ratio  # higher means more header-like

        # Strict comparison keeps the earliest line when scores tie.
        if score > best_score:
            best_score = score
            header_line_idx = i

    return header_line_idx
|
||
|
|
|
||
|
|
|
||
|
|
def read_csv(path: str):
    """Read a CSV file into a DataFrame, auto-detecting the header row.

    The header row index comes from ``detect_header_line``. If detection or
    the header-aware read fails for any reason, the file is read again with
    pandas' default of using the first line as the header.

    After loading, columns whose label starts with ``Unnamed`` are dropped,
    the remaining labels are converted to ``str`` and stripped of surrounding
    whitespace, and rows that are entirely empty are removed.

    Args:
        path: Path of the CSV file, read as UTF-8.

    Returns:
        The cleaned ``pandas.DataFrame``.

    Raises:
        Exception: Whatever the fallback ``pd.read_csv`` call raises (for
            example on an empty or undecodable file) is propagated.
    """
    try:
        header_line = detect_header_line(path)
        print(f"[INFO] Detected header line: {header_line + 1}")
        # detect_header_line counts raw file lines, blank ones included.
        # `header=N` counts lines *after* pandas has skipped blank lines, so
        # it can select the wrong row; `skiprows=N` counts raw lines, which
        # matches the detector. header=0 then takes the next non-blank line.
        df = pd.read_csv(
            path,
            skiprows=header_line,
            header=0,
            encoding='utf-8',
            low_memory=False,
        )
    except Exception as e:
        # Deliberate best-effort: any failure falls back to a plain read.
        print(f"[WARN] Gagal deteksi header otomatis: {e}, fallback ke baris pertama")
        df = pd.read_csv(path, encoding='utf-8', low_memory=False)

    # Drop the placeholder columns pandas names "Unnamed: N" for empty header
    # cells. Going through str() keeps this safe for non-string labels.
    keep = [not str(c).startswith('Unnamed') for c in df.columns]
    df = df.loc[:, keep]
    df.columns = [str(c).strip() for c in df.columns]

    # Remove rows in which every value is missing.
    df = df.dropna(how='all')

    return df
|