diff --git a/services/reader_csv.py b/services/reader_csv.py index c5cecb9..1c66c78 100644 --- a/services/reader_csv.py +++ b/services/reader_csv.py @@ -117,57 +117,149 @@ def detect_delimiter(path, sample_size=2048): return delim return ',' -def read_csv(path: str): +# def read_csv(path: str): +# ext = os.path.splitext(path)[1].lower() + +# try: +# if ext in ['.csv']: +# # === Baca file CSV === +# header_line = detect_header_line(path) +# delimiter = detect_delimiter(path) +# print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'") + +# df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',') + +# elif ext in ['.xlsx', '.xls']: +# # === Baca file Excel === +# print(f"[INFO] Membaca file Excel: {os.path.basename(path)}") +# xls = pd.ExcelFile(path) + +# print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}") + +# # Evaluasi tiap sheet untuk mencari yang paling relevan +# best_sheet = None +# best_score = -1 +# best_df = None + +# for sheet_name in xls.sheet_names: +# try: +# df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str) +# df = df.dropna(how='all').dropna(axis=1, how='all') + +# if len(df) == 0 or len(df.columns) < 2: +# continue + +# # hitung "skor relevansi" +# text_ratio = df.applymap(lambda x: isinstance(x, str)).sum().sum() / (df.size or 1) +# row_score = len(df) +# score = (row_score * 0.7) + (text_ratio * 100) + +# if score > best_score: +# best_score = score +# best_sheet = sheet_name +# best_df = df + +# except Exception as e: +# print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}") +# continue + +# if best_df is not None: +# print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}") +# df = best_df +# else: +# raise ValueError("Tidak ada sheet valid yang dapat dibaca.") + +# # Konversi tipe numerik jika ada +# for col in df.columns: +# if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any(): +# df[col] = df[col].astype(str).str.replace(',', '', regex=False) +# df[col] = pd.to_numeric(df[col], errors='ignore') + +# else: +# raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)") + +# except Exception as e: +# print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.") +# df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',') + +# # Bersihkan kolom dan baris kosong +# df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')] +# df.columns = [str(c).strip() for c in df.columns] +# df = df.dropna(how='all') + +# return df + + + + + +def read_csv(path: str, sheet: str = None): ext = os.path.splitext(path)[1].lower() try: - if ext in ['.csv', '.txt']: + if ext in ['.csv']: # === Baca file CSV === header_line = detect_header_line(path) delimiter = detect_delimiter(path) print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'") - df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',') + df = pd.read_csv( + path, + header=header_line, + sep=delimiter, + encoding='utf-8', + low_memory=False, + thousands=',' + ) elif ext in ['.xlsx', '.xls']: # === Baca file Excel === print(f"[INFO] Membaca file Excel: {os.path.basename(path)}") xls = pd.ExcelFile(path) - print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}") - # Evaluasi tiap sheet untuk mencari yang paling relevan - best_sheet = None - best_score = -1 - best_df = None + # === Jika user memberikan nama sheet === + if sheet: + if sheet not in xls.sheet_names: + raise ValueError(f"Sheet '{sheet}' tidak ditemukan dalam file {os.path.basename(path)}") + print(f"[INFO] Membaca sheet yang ditentukan: '{sheet}'") + df = pd.read_excel(xls, sheet_name=sheet, header=0, dtype=str) + df = df.dropna(how='all').dropna(axis=1, how='all') - for sheet_name in xls.sheet_names: - try: - df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str) - df = df.dropna(how='all').dropna(axis=1, how='all') + else: + # === Auto-detect sheet terbaik === + print("[INFO] Tidak ada sheet yang ditentukan, mencari sheet paling relevan...") + best_sheet = None + best_score = -1 + best_df = None - if len(df) == 0 or len(df.columns) < 2: + for sheet_name in xls.sheet_names: + try: + temp_df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str) + temp_df = temp_df.dropna(how='all').dropna(axis=1, how='all') + + if len(temp_df) == 0 or len(temp_df.columns) < 2: + continue + + # hitung skor relevansi + text_ratio = temp_df.applymap(lambda x: isinstance(x, str)).sum().sum() / (temp_df.size or 1) + row_score = len(temp_df) + score = (row_score * 0.7) + (text_ratio * 100) + + if score > best_score: + best_score = score + best_sheet = sheet_name + best_df = temp_df + + except Exception as e: + print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}") continue - # hitung "skor relevansi" - text_ratio = df.applymap(lambda x: isinstance(x, str)).sum().sum() / (df.size or 1) - row_score = len(df) - score = (row_score * 0.7) + (text_ratio * 100) - - if score > best_score: - best_score = score - best_sheet = sheet_name - best_df = df - - except Exception as e: - print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}") - continue - - if best_df is not None: - print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}") - df = best_df - else: - raise ValueError("Tidak ada sheet valid yang dapat dibaca.") + if best_df is not None: + print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}") + df = best_df + else: + raise ValueError("Tidak ada sheet valid yang dapat dibaca.") # Konversi tipe numerik jika ada for col in df.columns: @@ -176,7 +268,7 @@ def read_csv(path: str): df[col] = pd.to_numeric(df[col], errors='ignore') else: - raise ValueError("Format file tidak dikenali (hanya .csv, .txt, .xlsx, .xls)") + raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)") except Exception as e: print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.") @@ -188,3 +280,4 @@ def read_csv(path: str): df = df.dropna(how='all') return df +