# import pandas as pd

# def read_csv(path: str):
#     df = pd.read_csv(path)
#     df.columns = [c.strip() for c in df.columns]

#     return df


# services/reader_csv.py
import pandas as pd
import re

def detect_header_line(path, max_rows=10):
    """
    Detect which line of a CSV file holds the header (column names).

    At most ``max_rows`` lines are read from the start of the file. Each line
    is split on ``;``, ``,``, ``|`` or tab and scored as
    ``(fraction of cells containing an ASCII letter) -
    (fraction of cells containing a digit)``; the line with the highest score
    is taken to be the header.

    Args:
        path: Path of the CSV file to inspect. It is opened as UTF-8 and
            undecodable bytes are ignored.
        max_rows: Maximum number of leading lines to inspect.

    Returns:
        0-based index (counting every physical line, blank ones included) of
        the best-scoring line. Returns 0 when no inspected line scores above
        -1, which includes an empty file.
    """
    # zip() stops as soon as either side is exhausted, so a file with fewer
    # than max_rows lines is simply read in full.
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        lines = [line for _, line in zip(range(max_rows), f)]

    # Compile once; the same patterns are applied to every candidate line.
    delimiter_re = re.compile(r'[;,|\t]')
    alpha_re = re.compile(r'[A-Za-z]')
    digit_re = re.compile(r'\d')

    header_line_idx = 0
    best_score = -1

    for i, line in enumerate(lines):
        # Split on comma / semicolon / pipe / tab.
        cells = delimiter_re.split(line.strip())
        n_cells = max(len(cells), 1)
        # Heuristic: many cells with letters and few with digits suggests a
        # header row rather than a data row.
        alpha_ratio = sum(bool(alpha_re.search(c)) for c in cells) / n_cells
        digit_ratio = sum(bool(digit_re.search(c)) for c in cells) / n_cells
        score = alpha_ratio - digit_ratio  # higher means more header-like

        # Strict comparison: on ties the earliest line wins.
        if score > best_score:
            best_score = score
            header_line_idx = i

    return header_line_idx


def read_csv(path: str) -> pd.DataFrame:
    """
    Read a CSV file into a DataFrame, auto-detecting the header line.

    The header position comes from ``detect_header_line``. If detection or
    the header-aware read raises, a warning is printed and the file is read
    again with pandas' default header handling (first line).

    After loading, columns whose name starts with ``Unnamed`` are dropped,
    the remaining column names are stripped of surrounding whitespace, and
    rows that are entirely NaN are removed.

    Args:
        path: Path of the CSV file, read as UTF-8.

    Returns:
        The cleaned DataFrame.

    Raises:
        Any exception raised by the fallback ``pd.read_csv`` call (for
        example when the file does not exist or is empty).
    """
    try:
        header_line = detect_header_line(path)
        print(f"[INFO] Detected header line: {header_line + 1}")
        # detect_header_line() returns a physical line index (blank lines
        # counted), but pandas' ``header=N`` ignores blank lines when
        # skip_blank_lines is on (the default). ``skiprows`` counts physical
        # lines, so skip everything before the header and read it as row 0.
        df = pd.read_csv(
            path,
            skiprows=header_line,
            header=0,
            encoding='utf-8',
            low_memory=False,
        )
    except Exception as e:
        # Deliberate best-effort: any failure falls back to a plain read.
        print(f"[WARN] Gagal deteksi header otomatis: {e}, fallback ke baris pertama")
        df = pd.read_csv(path, encoding='utf-8', low_memory=False)

    # Drop empty / unnamed columns (pandas labels header-less columns
    # "Unnamed: N").
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    df.columns = [str(c).strip() for c in df.columns]

    # Drop rows that are completely empty.
    df = df.dropna(how='all')

    return df