update xlsx sheet selector
This commit is contained in:
parent
c953ae7675
commit
f25b4f3851
|
|
@ -117,57 +117,149 @@ def detect_delimiter(path, sample_size=2048):
|
|||
return delim
|
||||
return ','
|
||||
|
||||
def read_csv(path: str):
|
||||
# def read_csv(path: str):
|
||||
# ext = os.path.splitext(path)[1].lower()
|
||||
|
||||
# try:
|
||||
# if ext in ['.csv']:
|
||||
# # === Baca file CSV ===
|
||||
# header_line = detect_header_line(path)
|
||||
# delimiter = detect_delimiter(path)
|
||||
# print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
|
||||
|
||||
# df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',')
|
||||
|
||||
# elif ext in ['.xlsx', '.xls']:
|
||||
# # === Baca file Excel ===
|
||||
# print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
|
||||
# xls = pd.ExcelFile(path)
|
||||
|
||||
# print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")
|
||||
|
||||
# # Evaluasi tiap sheet untuk mencari yang paling relevan
|
||||
# best_sheet = None
|
||||
# best_score = -1
|
||||
# best_df = None
|
||||
|
||||
# for sheet_name in xls.sheet_names:
|
||||
# try:
|
||||
# df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
|
||||
# df = df.dropna(how='all').dropna(axis=1, how='all')
|
||||
|
||||
# if len(df) == 0 or len(df.columns) < 2:
|
||||
# continue
|
||||
|
||||
# # hitung "skor relevansi"
|
||||
# text_ratio = df.applymap(lambda x: isinstance(x, str)).sum().sum() / (df.size or 1)
|
||||
# row_score = len(df)
|
||||
# score = (row_score * 0.7) + (text_ratio * 100)
|
||||
|
||||
# if score > best_score:
|
||||
# best_score = score
|
||||
# best_sheet = sheet_name
|
||||
# best_df = df
|
||||
|
||||
# except Exception as e:
|
||||
# print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
|
||||
# continue
|
||||
|
||||
# if best_df is not None:
|
||||
# print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
|
||||
# df = best_df
|
||||
# else:
|
||||
# raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
|
||||
|
||||
# # Konversi tipe numerik jika ada
|
||||
# for col in df.columns:
|
||||
# if df[col].astype(str).str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
|
||||
# df[col] = df[col].astype(str).str.replace(',', '', regex=False)
|
||||
# df[col] = pd.to_numeric(df[col], errors='ignore')
|
||||
|
||||
# else:
|
||||
# raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)")
|
||||
|
||||
# except Exception as e:
|
||||
# print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
|
||||
# df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
|
||||
|
||||
# # Bersihkan kolom dan baris kosong
|
||||
# df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
|
||||
# df.columns = [str(c).strip() for c in df.columns]
|
||||
# df = df.dropna(how='all')
|
||||
|
||||
# return df
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def read_csv(path: str, sheet: str = None):
|
||||
ext = os.path.splitext(path)[1].lower()
|
||||
|
||||
try:
|
||||
if ext in ['.csv', '.txt']:
|
||||
if ext in ['.csv']:
|
||||
# === Baca file CSV ===
|
||||
header_line = detect_header_line(path)
|
||||
delimiter = detect_delimiter(path)
|
||||
print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
|
||||
|
||||
df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',')
|
||||
df = pd.read_csv(
|
||||
path,
|
||||
header=header_line,
|
||||
sep=delimiter,
|
||||
encoding='utf-8',
|
||||
low_memory=False,
|
||||
thousands=','
|
||||
)
|
||||
|
||||
elif ext in ['.xlsx', '.xls']:
|
||||
# === Baca file Excel ===
|
||||
print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
|
||||
xls = pd.ExcelFile(path)
|
||||
|
||||
print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")
|
||||
|
||||
# Evaluasi tiap sheet untuk mencari yang paling relevan
|
||||
best_sheet = None
|
||||
best_score = -1
|
||||
best_df = None
|
||||
# === Jika user memberikan nama sheet ===
|
||||
if sheet:
|
||||
if sheet not in xls.sheet_names:
|
||||
raise ValueError(f"Sheet '{sheet}' tidak ditemukan dalam file {os.path.basename(path)}")
|
||||
print(f"[INFO] Membaca sheet yang ditentukan: '{sheet}'")
|
||||
df = pd.read_excel(xls, sheet_name=sheet, header=0, dtype=str)
|
||||
df = df.dropna(how='all').dropna(axis=1, how='all')
|
||||
|
||||
for sheet_name in xls.sheet_names:
|
||||
try:
|
||||
df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
|
||||
df = df.dropna(how='all').dropna(axis=1, how='all')
|
||||
else:
|
||||
# === Auto-detect sheet terbaik ===
|
||||
print("[INFO] Tidak ada sheet yang ditentukan, mencari sheet paling relevan...")
|
||||
best_sheet = None
|
||||
best_score = -1
|
||||
best_df = None
|
||||
|
||||
if len(df) == 0 or len(df.columns) < 2:
|
||||
for sheet_name in xls.sheet_names:
|
||||
try:
|
||||
temp_df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
|
||||
temp_df = temp_df.dropna(how='all').dropna(axis=1, how='all')
|
||||
|
||||
if len(temp_df) == 0 or len(temp_df.columns) < 2:
|
||||
continue
|
||||
|
||||
# hitung skor relevansi
|
||||
text_ratio = temp_df.applymap(lambda x: isinstance(x, str)).sum().sum() / (temp_df.size or 1)
|
||||
row_score = len(temp_df)
|
||||
score = (row_score * 0.7) + (text_ratio * 100)
|
||||
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_sheet = sheet_name
|
||||
best_df = temp_df
|
||||
|
||||
except Exception as e:
|
||||
print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
|
||||
continue
|
||||
|
||||
# hitung "skor relevansi"
|
||||
text_ratio = df.applymap(lambda x: isinstance(x, str)).sum().sum() / (df.size or 1)
|
||||
row_score = len(df)
|
||||
score = (row_score * 0.7) + (text_ratio * 100)
|
||||
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_sheet = sheet_name
|
||||
best_df = df
|
||||
|
||||
except Exception as e:
|
||||
print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
|
||||
continue
|
||||
|
||||
if best_df is not None:
|
||||
print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
|
||||
df = best_df
|
||||
else:
|
||||
raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
|
||||
if best_df is not None:
|
||||
print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
|
||||
df = best_df
|
||||
else:
|
||||
raise ValueError("Tidak ada sheet valid yang dapat dibaca.")
|
||||
|
||||
# Konversi tipe numerik jika ada
|
||||
for col in df.columns:
|
||||
|
|
@ -176,7 +268,7 @@ def read_csv(path: str):
|
|||
df[col] = pd.to_numeric(df[col], errors='ignore')
|
||||
|
||||
else:
|
||||
raise ValueError("Format file tidak dikenali (hanya .csv, .txt, .xlsx, .xls)")
|
||||
raise ValueError("Format file tidak dikenali (hanya .csv, .xlsx, .xls)")
|
||||
|
||||
except Exception as e:
|
||||
print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
|
||||
|
|
@ -188,3 +280,4 @@ def read_csv(path: str):
|
|||
df = df.dropna(how='all')
|
||||
|
||||
return df
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user